Data Preprocessing


In [1]:
import numpy as np
import random
import tensorflow as tf

In [2]:
import os,sys
spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.3-src.zip'))
execfile(os.path.join(spark_home, 'python/pyspark/shell.py'))


Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.0.2
      /_/

Using Python version 2.7.9 (default, Jun 29 2016 13:08:31)
SparkSession available as 'spark'.

In [3]:
raw = sc.textFile("/net/account/pixuser/kent/work/pixinterest/grouping/data/article_seq_text/part*")

In [4]:
def filter_stop(ws):
    r = []
    stop={""}
    for w in ws:
        if w not in stop:
            r.append(w)
    return r
            
doc = raw.map(lambda x : x.split(" ")).map(filter_stop)
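
As a quick sanity check on a made-up line (not corpus data), filter_stop only drops the empty tokens that consecutive spaces leave behind after split:

filter_stop("a  b c".split(" "))
# ['a', '', 'b', 'c'] -> ['a', 'b', 'c']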

In [5]:
TOP_N = 100000
top_term = doc.flatMap(lambda x : x).filter(
    lambda x:x not in {"",}).map(
    lambda x : (x,1)).reduceByKey(
    lambda x,y : x+y).takeOrdered(TOP_N,key=lambda x: -x[1])

Build the word index


In [6]:
word2indx={}
index2word=[w[0] for w in top_term]
for idx,w in enumerate(index2word):
    word2indx[w] = idx

In [7]:
def content_indexing(data):
    r = []
    for w in data:
        if w in word2indx:
            r.append(word2indx[w])
        else:
            r.append(-1)
    return r

In [8]:
def get_pairs(content):
    pairs =[]
    for i in range(1,len(content)-1):
        if content[i]==-1 or content[i-1]==-1 or content[i+1]==-1: continue
        pairs.append((content[i],content[i-1]))
        pairs.append((content[i],content[i+1]))
    return pairs
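
To make the pair generation concrete, here is a made-up indexed document (the ids are hypothetical; -1 marks an out-of-vocabulary word). Each word whose neighbours are all in the vocabulary yields one (center, left-context) and one (center, right-context) pair; the first and last positions never serve as centers:

get_pairs([5, 17, 42, 8])
# -> [(17, 5), (17, 42), (42, 17), (42, 8)]
get_pairs([5, -1, 42, 8])
# -> []  (pairs touching an out-of-vocabulary word are dropped)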

In [9]:
term_pair = doc.map(content_indexing).map(get_pairs)

In [10]:
vocabulary_size = TOP_N

In [11]:
import json
def doc2onebatch(content):
    data = []
    label = []
    row = {}
    for d,l in content:
        data.append(d)
        label.append([l])
    row['data'] = data
    row['label'] = label
    
    return json.dumps(row)
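
For reference, each document becomes one JSON line whose 'data' list holds the center words and whose 'label' list holds the matching context words, one per entry (hypothetical pairs; the key order in the dumped string may vary):

doc2onebatch([(17, 5), (17, 42)])
# -> '{"data": [17, 17], "label": [[5], [42]]}'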

In [12]:
!rm -rvf ./pixnet_word2vec/

In [ ]:
term_pair.map(doc2onebatch).saveAsTextFile("./pixnet_word2vec/")

Design the Graph


In [14]:
import math

batch_size = 128
embedding_size = 128 # Dimension of the embedding vector.
skip_window = 1 # How many words to consider left and right.
num_skips = 2 # How many times to reuse an input to generate a label.
# We pick a random validation set to sample nearest neighbors. here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent. 
valid_size = 16 # Random set of words to evaluate similarity on.
valid_window = 100 # Only pick dev samples in the head of the distribution.
valid_examples = np.array(random.sample(xrange(valid_window), valid_size))
num_sampled = 64 # Number of negative examples to sample.

graph = tf.Graph()

with graph.as_default():

  # Input data.
  train_dataset = tf.placeholder(tf.int32, shape=[None])
  train_labels = tf.placeholder(tf.int32, shape=[None, 1])
  valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
  
  # Variables.
  embeddings = tf.Variable(
    tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
  softmax_weights = tf.Variable(
    tf.truncated_normal([vocabulary_size, embedding_size],
                         stddev=1.0 / math.sqrt(embedding_size)))
  softmax_biases = tf.Variable(tf.zeros([vocabulary_size]))
  
  # Model.
  # Look up embeddings for inputs.
  embed = tf.nn.embedding_lookup(embeddings, train_dataset)
  # Compute the softmax loss, using a sample of the negative labels each time.
  loss = tf.reduce_mean(
    tf.nn.sampled_softmax_loss(softmax_weights, softmax_biases, embed,
                               train_labels, num_sampled, vocabulary_size))

  # Optimizer.
  optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)
  
  # Compute the similarity between minibatch examples and all embeddings.
  # We use the cosine distance:
  norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
  normalized_embeddings = embeddings / norm
  valid_embeddings = tf.nn.embedding_lookup(
    normalized_embeddings, valid_dataset)
  similarity = tf.matmul(valid_embeddings, tf.transpose(normalized_embeddings))
  init = tf.initialize_all_variables()


WARNING:tensorflow:From <ipython-input-14-a0ea8686b866>:50 in <module>.: initialize_all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Use `tf.global_variables_initializer` instead.

In [15]:
session = tf.Session(graph=graph)
session.run(init)
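
The similarity op defined above is plain cosine similarity between the validation words and every row of the embedding matrix; a rough numpy cross-check (for inspection only, not part of the graph) could look like:

emb = session.run(embeddings)
emb = emb / np.sqrt((emb ** 2).sum(axis=1, keepdims=True))
sim_np = emb[valid_examples].dot(emb.T)  # shape: (valid_size, vocabulary_size)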

In [16]:
def train(batch_data,batch_labels):
    feed_dict = {train_dataset : batch_data, train_labels : batch_labels}
    _, l = session.run([optimizer, loss], feed_dict=feed_dict)
    return l

In [17]:
class MySentences(object):
    def __init__(self, dirname):
        self.dirname = dirname
    def __iter__(self):
        for fname in os.listdir(self.dirname):
            if 'crc' in fname : continue
            for line in open(os.path.join(self.dirname, fname)):
                yield line

In [ ]:
for epoch in range(10):
    ms = MySentences("./pixnet_word2vec/")
    for idx , line in enumerate(ms):
        d = json.loads(line)
        data = d['data']
        if len(data)==0 : continue
        label = d['label']
        lost = train(data,label)
        if idx % 1000 ==0 : 
            print idx,lost

    #         feed_dict = {train_dataset : batch_data, train_labels : batch_labels}

            sim = session.run(similarity)
            for i in xrange(valid_size):
                valid_word = index2word[valid_examples[i]]
                top_k = 8 # number of nearest neighbors
                nearest = (-sim[i, :]).argsort()[1:top_k+1]
                log = "Nearest to %s:" % valid_word
                for k in xrange(top_k):
                  close_word = index2word[nearest[k]]
                  log = "%s %s," % (log, close_word)
                print log

In [26]:
final_embeddings = session.run(normalized_embeddings)

In [30]:
class PixWord2Vec(object):
    """Container for the vocabulary mappings and the trained embedding matrix."""
    index2word = None
    word2indx = None
    final_embeddings = None

In [33]:
pixw = PixWord2Vec()
pixw.index2word = index2word
pixw.word2indx = word2indx
pixw.final_embeddings = final_embeddings

import pickle
pickle.dump(pixw, open("./pixword.pk",'w'))

In [34]:
ppixw = pickle.load(open("./pixword.pk"))
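
A minimal sketch of querying the pickled object for nearest neighbours (the helper name nearest and its top_k default are ours, not part of the saved class); final_embeddings is already L2-normalized, so a dot product gives cosine similarity:

def nearest(model, word, top_k=8):
    idx = model.word2indx[word]
    sims = model.final_embeddings.dot(model.final_embeddings[idx])
    order = (-sims).argsort()[1:top_k + 1]  # skip the word itself
    return [model.index2word[i] for i in order]

# nearest(ppixw, index2word[10])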

In [38]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
import collections
import math
import numpy as np
import os
import random
import tensorflow as tf
import urllib
import zipfile
from matplotlib import pylab
from sklearn.manifold import TSNE

Visualize the embeddings: sign in to Plotly and project the vectors to 2D with t-SNE.


In [70]:
from random import randint
import plotly.plotly as py

# Plotly credentials were removed here; fill in (username, api_key) tuples before running.
account = []

_account, _pw = account[randint(0, len(account)-1)]
py.sign_in(_account, _pw)

In [87]:
num_points = 1000

tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
two_d_embeddings = tsne.fit_transform(final_embeddings[1:num_points+1, :])

In [88]:
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [90]:
words = [index2word[i] for i in xrange(1, num_points+1)]

In [91]:
import plotly.plotly as py
import plotly.graph_objs as go

x=[]
y=[]
for xvalue,yvalue in two_d_embeddings:
    x.append(xvalue)
    y.append(yvalue)

# Create a trace
trace = go.Scatter(
    x = x,
    y = y,
    text = words,
    mode = 'markers'
)


data = [trace]

# Plot and embed in ipython notebook!
py.iplot(data, filename='basic-scatter')


Out[91]:
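
matplotlib was imported above but never used; as an offline alternative to the Plotly scatter, a minimal sketch (Chinese labels render only if a suitable font is configured locally):

def plot_tsne(embeddings, labels):
    plt.figure(figsize=(15, 15))
    for i, label in enumerate(labels):
        x, y = embeddings[i, :]
        plt.scatter(x, y)
        plt.annotate(label, xy=(x, y), xytext=(5, 2),
                     textcoords='offset points', ha='right', va='bottom')
    plt.show()

# plot_tsne(two_d_embeddings, words)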

In [ ]: