Data preprocessing
In [1]:
import numpy as np
import random
import tensorflow as tf
In [2]:
import os, sys

# Bootstrap PySpark from SPARK_HOME so that `sc` is available in this notebook.
spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.3-src.zip'))
execfile(os.path.join(spark_home, 'python/pyspark/shell.py'))
In [3]:
# Load pre-tokenized travel articles (one article per line, space-separated tokens).
raw = sc.textFile("/net/account/pixuser/kent/work/pixinterest/grouping/data/article_seq_text_travel/p*")
In [4]:
def filter_stop(ws):
    """Drop stop tokens (here only the empty string) from a token list."""
    r = []
    stop = {""}
    for w in ws:
        if w not in stop:
            r.append(w)
    return r

# Tokenize each article on spaces and remove stop tokens.
doc = raw.map(lambda x: x.split(" ")).map(filter_stop)
In [5]:
TOP_N = 100000
# Count term frequencies across the corpus and keep the TOP_N most frequent terms.
top_term = doc.flatMap(lambda x: x) \
    .filter(lambda x: x not in {""}) \
    .map(lambda x: (x, 1)) \
    .reduceByKey(lambda x, y: x + y) \
    .takeOrdered(TOP_N, key=lambda x: -x[1])
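top_term is now an ordinary Python list of (term, count) tuples in descending frequency order, and its length (at most TOP_N) caps the vocabulary. A quick look at its head (the actual terms depend on the corpus):
# Illustrative inspection only; the values printed will differ per corpus.
for term, count in top_term[:5]:
    print term, count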
In [6]:
# index2word: position -> term; word2indx: term -> integer id.
word2indx = {}
index2word = [w[0] for w in top_term]
for idx, w in enumerate(index2word):
    word2indx[w] = idx
In [7]:
def content_indexing(data):
    """Map each token to its vocabulary id; out-of-vocabulary tokens become -1."""
    r = []
    for w in data:
        if w in word2indx:
            r.append(word2indx[w])
        else:
            r.append(-1)
    return r
In [8]:
def get_pairs(content):
    """Build skip-gram (center, context) pairs with a window of one word on each side."""
    pairs = []
    for i in range(1, len(content) - 1):
        if content[i] == -1 or content[i - 1] == -1 or content[i + 1] == -1:
            continue
        pairs.append((content[i], content[i - 1]))
        pairs.append((content[i], content[i + 1]))
    return pairs
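A quick worked example of the pair generation (the token ids are made up): a center position is kept only when it and both of its neighbors are in-vocabulary.
# Hypothetical indexed article; -1 marks an out-of-vocabulary token.
print get_pairs([12, 7, 33, -1, 5])   # -> [(7, 12), (7, 33)]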
In [9]:
# One element per article: the article's list of (center, context) pairs.
term_pair = doc.map(content_indexing).map(get_pairs)
In [10]:
vocabulary_size = TOP_N
In [11]:
import json

def doc2onebatch(content):
    """Serialize one article's skip-gram pairs as a JSON training batch."""
    data = []
    label = []
    row = {}
    for d, l in content:
        data.append(d)
        label.append([l])
    row['data'] = data
    row['label'] = label
    return json.dumps(row)
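Each JSON line carries the center words under 'data' and the context words under 'label', each label wrapped in a one-element list to match the [batch, 1] shape the TensorFlow placeholder below expects. For the hypothetical pairs above this looks roughly like (key order may vary):
print doc2onebatch([(7, 12), (7, 33)])   # {"data": [7, 7], "label": [[12], [33]]}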
In [12]:
!rm -rvf ./pixnet_word2vec_travel/
In [13]:
term_pair.map(doc2onebatch).saveAsTextFile("./pixnet_word2vec_travel/")
In [14]:
import math
batch_size = 128
embedding_size = 128 # Dimension of the embedding vector.
skip_window = 1 # How many words to consider left and right.
num_skips = 2 # How many times to reuse an input to generate a label.
# We pick a random validation set to sample nearest neighbors. here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent.
valid_size = 16 # Random set of words to evaluate similarity on.
valid_window = 100 # Only pick dev samples in the head of the distribution.
valid_examples = np.array(random.sample(xrange(valid_window), valid_size))
num_sampled = 64 # Number of negative examples to sample.
graph = tf.Graph()
with graph.as_default():
    # Input data.
    train_dataset = tf.placeholder(tf.int32, shape=[None])
    train_labels = tf.placeholder(tf.int32, shape=[None, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # Variables.
    embeddings = tf.Variable(
        tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    softmax_weights = tf.Variable(
        tf.truncated_normal([vocabulary_size, embedding_size],
                            stddev=1.0 / math.sqrt(embedding_size)))
    softmax_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # Model.
    # Look up embeddings for inputs.
    embed = tf.nn.embedding_lookup(embeddings, train_dataset)
    # Compute the softmax loss, using a sample of the negative labels each time.
    loss = tf.reduce_mean(
        tf.nn.sampled_softmax_loss(softmax_weights, softmax_biases, embed,
                                   train_labels, num_sampled, vocabulary_size))

    # Optimizer.
    optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)

    # Compute the similarity between minibatch examples and all embeddings.
    # We use the cosine distance:
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(
        normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embeddings, tf.transpose(normalized_embeddings))

    init = tf.initialize_all_variables()
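Because the embedding rows are L2-normalized first, the final matmul is plain cosine similarity. A minimal NumPy sketch of the same computation, on made-up vectors outside the graph:
# Cosine similarity via L2 normalization, mirroring the norm / matmul ops above.
v = np.random.uniform(-1.0, 1.0, size=(4, 3))   # four toy "embeddings"
v_norm = v / np.sqrt(np.sum(np.square(v), 1, keepdims=True))
print np.dot(v_norm, v_norm.T)                  # diagonal entries are 1.0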
In [15]:
session = tf.Session(graph=graph)
session.run(init)
In [16]:
def train(batch_data, batch_labels):
    """Run one optimization step on a single batch and return its loss."""
    feed_dict = {train_dataset: batch_data, train_labels: batch_labels}
    _, l = session.run([optimizer, loss], feed_dict=feed_dict)
    return l
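A quick smoke test with the hypothetical pairs from above confirms the expected feed shapes ([batch] for the data, [batch, 1] for the labels); the loss value itself depends on the random initialization.
print train([7, 7], [[12], [33]])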
In [17]:
class MySentences(object):
    """Iterate over the JSON batch files written by Spark, one line at a time."""
    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        for fname in os.listdir(self.dirname):
            if 'crc' in fname:
                continue  # skip Hadoop checksum files
            for line in open(os.path.join(self.dirname, fname)):
                yield line
In [ ]:
for epoch in range(100):
    ms = MySentences("./pixnet_word2vec_travel//")
    for idx, line in enumerate(ms):
        d = json.loads(line)
        data = d['data']
        if len(data) == 0:
            continue
        label = d['label']
        lost = train(data, label)
        if idx % 1000 == 0:
            print idx, lost
    # After each epoch, report the nearest neighbors of the validation words.
    sim = session.run(similarity)
    for i in xrange(valid_size):
        valid_word = index2word[valid_examples[i]]
        top_k = 8  # number of nearest neighbors
        nearest = (-sim[i, :]).argsort()[1:top_k + 1]
        log = "Nearest to %s:" % valid_word
        for k in xrange(top_k):
            close_word = index2word[nearest[k]]
            log = "%s %s," % (log, close_word)
        print log
In [21]:
final_embeddings = session.run(normalized_embeddings)
In [22]:
class PixWord2Vec:
    """Bundle of the vocabulary mappings and the trained embedding matrix."""
    index2word = None
    word2indx = None
    final_embeddings = None
In [23]:
pixw = PixWord2Vec()
pixw.index2word = index2word
pixw.word2indx = word2indx
pixw.final_embeddings = final_embeddings
import pickle
pickle.dump(pixw, open("./pixword_travel.pk", 'wb'))
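With the pickled object, nearest-neighbor queries reduce to a dot product against the (already L2-normalized) embedding matrix. A minimal sketch, using a hypothetical helper and assuming the query word is in the vocabulary:
def nearest_words(model, word, top_k=8):
    # Cosine similarity of one row against all rows; drop index 0 of the sort (the word itself).
    vec = model.final_embeddings[model.word2indx[word]]
    sims = np.dot(model.final_embeddings, vec)
    return [model.index2word[i] for i in (-sims).argsort()[1:top_k + 1]]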
In [34]:
ppixw = pickle.load(open("./pixword.pk"))
In [24]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
import collections
import math
import numpy as np
import os
import random
import tensorflow as tf
import urllib
import zipfile
from matplotlib import pylab
from sklearn.manifold import TSNE
Visualize the learned embeddings: project them to two dimensions with t-SNE and plot them with Plotly.
In [70]:
from random import randint
import plotly.plotly as py

# Plotly credentials: fill in one or more (username, api_key) pairs; one is chosen at random.
account = []
_account, _pw = account[randint(0, len(account) - 1)]
py.sign_in(_account, _pw)
In [87]:
num_points = 1000
# Reduce the first num_points embedding vectors (skipping row 0) to two dimensions.
tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
two_d_embeddings = tsne.fit_transform(final_embeddings[1:num_points+1, :])
In [88]:
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [90]:
words = [index2word[i] for i in xrange(1, num_points+1)]
In [91]:
import plotly.plotly as py
import plotly.graph_objs as go

x = []
y = []
for xvalue, yvalue in two_d_embeddings:
    x.append(xvalue)
    y.append(yvalue)

# Create a trace
trace = go.Scatter(
    x=x,
    y=y,
    text=words,
    mode='markers'
)
data = [trace]
# Plot and embed in ipython notebook!
py.iplot(data, filename='basic-scatter')
Out[91]: (interactive Plotly scatter of the 2-D t-SNE projection)
In [ ]: