In [1]:
import numpy as np
import os
from random import shuffle
import re
In [2]:
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
output_notebook()
In [3]:
import numpy as np
import pandas as pd
import re
from sklearn.preprocessing import LabelBinarizer
from tensorflow.contrib import learn
import tensorflow as tf
In [4]:
def load_files(train_files, dev_files, class_file, text_column=3):
    """Read the training CSVs and whitespace-tokenize their text column.

    Parameters
    ----------
    train_files : iterable of str
        Paths of the CSV files to read (decoded as ``latin-1``).
    dev_files, class_file
        Accepted so existing callers keep working; not used here.
    text_column : int, default 3
        Zero-based position of the column that holds the text.

    Returns
    -------
    list of list of str
        One token list per CSV row, in file order. A row whose text cell
        is missing yields an empty list, so the output stays aligned with
        the input rows.
    """
    # Only the text column is needed, so pick it per file instead of
    # stacking whole frames.
    text_columns = [
        pd.read_csv(path, encoding='latin-1').iloc[:, text_column]
        for path in train_files
    ]
    if not text_columns:
        # Concatenating an empty sequence raises; no files means no sentences.
        return []
    texts = pd.concat(text_columns, ignore_index=True)

    x_token = []
    for text in texts:
        if isinstance(text, str):
            # split() with no argument already ignores leading/trailing whitespace.
            x_token.append(text.split())
        elif pd.isna(text):
            # Empty CSV cells are read as NaN (a float), which has no .strip().
            x_token.append([])
        else:
            # Non-string cell (e.g. a purely numeric entry): tokenize its text form.
            x_token.append(str(text).split())
    return x_token
In [6]:
# Dataset selection. Files 0 .. train_index-1 are used for training.
train_data = "eclipse/"
train_index = 11
models = "TextHMCNN"

# Derived locations; every directory string keeps its trailing slash.
data_dir = "{}{}".format("../../data/data_by_ocean/", train_data)
data_results = "{}results/{}/".format(data_dir, models)
In [7]:
# Path of the class file for this split; it is only handed on to load_files.
class_file = "{}class_{}.csv".format(data_results, train_index)

# Files 0 .. train_index-1 are the training set; file number train_index is
# the single held-out file.
train_files = ["{}{}.csv".format(data_dir, i) for i in range(train_index)]
test_files = ["{}{}.csv".format(data_dir, i)
              for i in range(train_index, train_index + 1)]

# Tokenized training text: one list of tokens per CSV row.
x_train = load_files(train_files, test_files, class_file)
In [10]:
import collections

# Flatten the tokenized sentences into a single stream of tokens.
vocabulary = [token for sentence in x_train for token in sentence]

# (token, count) pairs for the 1000 most frequent tokens, most frequent first.
word_counts = collections.Counter(vocabulary).most_common(1000)
In [8]:
from gensim.models import Word2Vec
In [14]:
# Train word2vec embeddings on the tokenized training sentences.
# size=300 requests 300-dimensional vectors; min_count=10 is gensim's
# minimum word-frequency cutoff for the model vocabulary.
model_ted = Word2Vec(x_train, size=300, min_count=10)
In [15]:
# Keep only the frequent words the model actually has a vector for. The model
# was trained with a min_count cutoff, so a word from the most-common list can
# be absent from its vocabulary, and looking it up would raise KeyError.
words_top_ted = [word for word, _count in word_counts if word in model_ted.wv]

# Look vectors up through .wv (the KeyedVectors object), the same attribute
# used when the model is exported; indexing the model object directly is
# deprecated in gensim.
words_top_vec_ted = model_ted.wv[words_top_ted]
In [16]:
from sklearn.manifold import TSNE
# Reduce the word vectors to 2 components so they can be drawn on a plane.
# random_state is pinned to 0 so that the seed is explicit.
tsne = TSNE(n_components=2, random_state=0)
# Rows of the result are in the same order as words_top_vec_ted.
words_top_ted_tsne = tsne.fit_transform(words_top_vec_ted)
In [17]:
# Interactive scatter plot of the 2-D t-SNE coordinates, one point per word.
p = figure(
    title="word2vec T-SNE for most common words",
    tools="pan,wheel_zoom,reset,save",
    toolbar_location="above",
)

# Shared data source: both the markers and the text labels read from it.
source = ColumnDataSource(data={
    "x1": words_top_ted_tsne[:, 0],
    "x2": words_top_ted_tsne[:, 1],
    "names": words_top_ted,
})
p.scatter(x="x1", y="x2", size=8, source=source)

# Label every point with its word, shifted by y_offset=6 from the marker.
labels = LabelSet(
    x="x1", y="x2", text="names", source=source,
    y_offset=6, text_align='center',
    text_font_size="8pt", text_color="#555555",
)
p.add_layout(labels)
show(p)
In [23]:
# Export the trained vectors with binary=True to 'eclipse.bin'; the vocabulary
# is additionally written to 'eclipse.vocab' via fvocab.
model_ted.wv.save_word2vec_format(fname='eclipse.bin', fvocab='eclipse.vocab', binary=True)
In [24]:
# Export the same vectors again without binary=True.
# NOTE(review): despite the .csv extension this is presumably gensim's
# word2vec text format rather than comma-separated data — confirm before
# reading the file with a CSV parser.
model_ted.wv.save_word2vec_format(fname='eclipse.csv')
In [ ]: