In [1]:
import os, sys
spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.3-src.zip'))
execfile(os.path.join(spark_home, 'python/pyspark/shell.py'))
In [2]:
class PixWord2Vec:
    # vocabulary indexing
    index2word = None
    word2indx = None
    # embedding matrix
    embeddings = None
    # normalized embedding matrix
    final_embeddings = None
    # hidden layer's weights and biases
    softmax_weights = None
    softmax_biases = None
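For orientation, a minimal sketch of how these fields relate; the toy values below are illustrative and not taken from the actual pickled model.

In [ ]:
# Toy illustration: index2word maps an integer id to a token, word2indx is its
# inverse, and embeddings / final_embeddings are (vocabulary_size, 128) float32 matrices.
demo = PixWord2Vec()
demo.index2word = [u'旅遊', u'台東']
demo.word2indx = {w: i for i, w in enumerate(demo.index2word)}
print demo.word2indx[u'台東']   # -> 1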
In [3]:
# The Word2Vec model must be trained first and pickled to this file.
import pickle
pixword = pickle.load(open("./pixword_cnn_word2vec.pk", "rb"))
In [21]:
import numpy as np
import random
import tensorflow as tf
import json
from pyspark import StorageLevel
In [5]:
vocabulary_size = len(pixword.index2word)
print "vocabulary_size" , vocabulary_size
In [6]:
pixword.embeddings.shape
Out[6]:
In [94]:
import math
# Doc (tags/category) batch size. This is the key parameter and it cannot be
# too large: each tags2vec() call below trains at most append_size new
# document vectors at once.
append_size = 1000
batch_size = 128
embedding_size = 128  # Dimension of the embedding vector.
num_sampled = 64      # Number of negative classes drawn by the sampled softmax.
graph = tf.Graph()
with graph.as_default():
    np.random.seed(0)
    # Input data.
    train_dataset = tf.placeholder(tf.int32, shape=[None])
    train_labels = tf.placeholder(tf.int32, shape=[None, 1])
    # Variables: the pretrained Word2Vec parameters, each extended with
    # append_size freshly initialized rows that will hold the document vectors.
    embeddings = tf.Variable(np.append(pixword.embeddings,
        np.random.randn(append_size, 128)).reshape(vocabulary_size + append_size, 128).astype('float32'))
    softmax_weights = tf.Variable(np.append(pixword.softmax_weights,
        np.random.randn(append_size, 128)).reshape(vocabulary_size + append_size, 128).astype('float32'))
    softmax_biases = tf.Variable(np.append(pixword.softmax_biases, [0] * append_size).astype('float32'))
    # Model.
    # Look up embeddings for inputs.
    embed = tf.nn.embedding_lookup(embeddings, train_dataset)
    # Compute the softmax loss, using a sample of the negative labels each time.
    loss = tf.reduce_mean(
        tf.nn.sampled_softmax_loss(softmax_weights, softmax_biases, embed,
                                   train_labels, num_sampled, vocabulary_size))
    # Optimizer.
    optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)
    # Compute the similarity between minibatch examples and all embeddings.
    # We use the cosine distance:
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    init = tf.global_variables_initializer()
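The scheme is doc2vec-like: only the appended rows need to learn, and each document slot predicts its own tags through the sampled softmax, pulling the document vector into the pretrained word space. A quick shape check, relying only on the variables defined above:

In [ ]:
# Sanity check: rows [0, vocabulary_size) are the pretrained word vectors,
# rows [vocabulary_size, vocabulary_size + append_size) are the document slots.
with graph.as_default():
    print embeddings.get_shape()        # (vocabulary_size + append_size, 128)
    print softmax_weights.get_shape()   # same shape as the embedding table
    print softmax_biases.get_shape()    # (vocabulary_size + append_size,)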
In [95]:
session = tf.Session(graph=graph)
session.run(init)
In [61]:
def train(batch_data, batch_labels):
    feed_dict = {train_dataset: batch_data, train_labels: batch_labels}
    _, l = session.run([optimizer, loss], feed_dict=feed_dict)
    return l
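An illustrative call with toy indices rather than real data: each entry of batch_data is a document slot id (at or above vocabulary_size) and each label is a word id that document should predict.

In [ ]:
# Toy batch: document slot 0 should predict word ids 3 and 7, slot 1 should
# predict word id 5 (these ids are assumed to be valid in this vocabulary).
toy_data = [vocabulary_size, vocabulary_size, vocabulary_size + 1]
toy_labels = [[3], [7], [5]]
print train(toy_data, toy_labels)   # the sampled-softmax loss for this batch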
In [62]:
def searchByVec(vec, final_embeddings, scope=5):
    # Rows of final_embeddings are L2-normalized, so a dot product is cosine similarity.
    sim = np.dot(final_embeddings, vec)
    # Take the top-`scope` matches in descending order, skipping the query itself.
    for index in sim.argsort()[-scope:][::-1][1:]:
        print pixword.index2word[index], sim[index]
In [96]:
cate_vec = []
count = 0
def tags2vec(words_set):
    # Re-initialize the graph so each call trains the document slots from scratch.
    np.random.seed(0)
    session.run(init)
    if len(words_set) > append_size:
        raise ValueError('words_set must contain at most append_size documents')
    cat_data = []
    cat_label = []
    # Pair each document slot id with every in-vocabulary tag it should predict.
    for index, words in enumerate(words_set):
        for w in words:
            if w not in pixword.word2indx:
                continue
            wi = pixword.word2indx[w]
            cat_data.append(vocabulary_size + index)
            cat_label.append([wi])
    for _ in range(20):
        train(cat_data, cat_label)
    final_embeddings = session.run(normalized_embeddings)
    # Return (document vectors, word vectors), both L2-normalized.
    return (final_embeddings[vocabulary_size:vocabulary_size + index + 1],
            final_embeddings[:vocabulary_size])
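Illustrative usage with two toy tag sets (the tags are assumed to be in the vocabulary; out-of-vocabulary tags are silently skipped):

In [ ]:
doc_vecs, word_vecs = tags2vec([[u'旅遊', u'台東'], [u'美食']])
print doc_vecs.shape    # (2, 128): one trained vector per document
print word_vecs.shape   # (vocabulary_size, 128): the re-normalized word vectors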
In [141]:
words = [u'旅遊', u'台東']
avg_vec = np.average([pixword.final_embeddings[pixword.word2indx[w]] for w in words], 0)
for w in words:
    print "#{}#".format(w.encode('utf-8'))
    searchByVec(pixword.final_embeddings[pixword.word2indx[w]], pixword.final_embeddings)
    print
# Simply take the mean of these words' vectors.
print "AVG Vector"
searchByVec(avg_vec, pixword.final_embeddings, scope=20)
print
# Suppose a document contains these tag words; the keywords found with the
# newly trained document vector are as follows.
print "Tag Vector"
result = tags2vec([words])
searchByVec(result[0][0], result[1], scope=20)
In [65]:
# read raw data
def checkInVoc(tlist):
    # Keep only the tags that exist in the Word2Vec vocabulary.
    r = []
    for t in tlist:
        if t in pixword.word2indx:
            r.append(t)
    return r

def merge(x):
    # x is a (record, filtered_tags) pair; overwrite the record's tags.
    x[0]['tags'] = x[1]
    return x[0]
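A toy record shows how the two helpers combine in the Spark pipeline below (the URL and tags are made up):

In [ ]:
rec = {'url': 'http://example.com/post/1', 'tags': [u'旅遊', u'not-a-real-tag']}
# checkInVoc drops the unknown tag; merge writes the filtered list back.
print merge((rec, checkInVoc(rec['tags'])))['tags']   # -> [u'旅遊'] if it is in the vocabulary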
In [22]:
test_set = sc.textFile("./data/cuted_test/").map(
json.loads).map(
lambda x : (x,x['tags']) ).mapValues(
checkInVoc).filter(
lambda x : len(x[1])>1)
test_set.persist(StorageLevel.DISK_ONLY)
Out[22]:
In [24]:
!rm -rvf ./data/cuted_and_tags/
import json
test_set.map(merge).map(json.dumps).saveAsTextFile("./data/cuted_and_tags/")
In [66]:
class MySentences(object):
    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        # Stream lines from every Spark part file, skipping checksum and marker files.
        for fname in os.listdir(self.dirname):
            if 'crc' in fname: continue
            if fname.startswith('_'): continue
            for line in open(os.path.join(self.dirname, fname)):
                yield line
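A quick peek at what the iterator yields (assuming the Spark output from the previous step exists on disk):

In [ ]:
for i, line in enumerate(MySentences("./data/cuted_and_tags/")):
    print json.loads(line)['tags']
    if i == 2:
        break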
In [26]:
sc.textFile("./data/cuted_and_tags/").count()
Out[26]:
In [72]:
def toVector(docs, tags_set, f):
    res_vecs = tags2vec(tags_set)
    if len(docs) != len(res_vecs[0]):
        print len(docs), len(res_vecs[0])
        raise ValueError('document/vector count mismatch')
    for index, d in enumerate(docs):
        d['tag_vec'] = [float(i) for i in list(res_vecs[0][index])]
    for d in docs:
        jstr = json.dumps(d)
        f.write(jstr + '\n')
In [98]:
!rm ./data/cuted_and_vec.json
f = open('./data/cuted_and_vec.json', 'w')
docs = []
tags_set = []
# tags2vec() can only train append_size document slots at once, so process
# the corpus in chunks of 1000 documents.
for doc in MySentences("./data/cuted_and_tags/"):
    js_objects = json.loads(doc)
    docs.append(js_objects)
    tags_set.append(js_objects['tags'])
    if len(docs) == 1000:
        toVector(docs, tags_set, f)
        docs = []
        tags_set = []
        print '*',
if docs:
    toVector(docs, tags_set, f)
f.close()
In [99]:
def loadjson(x):
    try:
        return json.loads(x)
    except ValueError:
        return None
In [100]:
jsondoc = sc.textFile(
    "./data/cuted_and_vec.json").map(
    loadjson).filter(
    lambda x: x is not None)
In [101]:
from operator import add
In [102]:
url_vecs = np.array(jsondoc.map(
    lambda x: np.array(x['tag_vec'])).collect())
In [103]:
url_vecs.shape
Out[103]:
In [104]:
urls = jsondoc.collect()
In [130]:
def search(wvec, final_embeddings, cate):
    sim = np.dot(final_embeddings, wvec)
    # Scan the 1000 nearest documents (excluding the query itself) and print
    # those in the same category with similarity above 0.9.
    for index in sim.argsort()[-1000:][::-1][1:]:
        if urls[index]['category'] == cate and sim[index] > 0.9:
            print urls[index]['url'], sim[index],
            for tag in urls[index]['tags']:
                print tag,
            print
    return sim
In [140]:
index = np.random.randint(10000)
print urls[index]['url'], urls[index]['category'],
for tag in urls[index]['tags']:
    print tag,
print
print
print "######## URLs found using the tag vector ########"
sim = search(url_vecs[index], url_vecs, urls[index]['category'])
print
print
print "######## Results from matching directly on the first tag; these are much better ########"
count = 0
for u in urls:
    for t in u['tags']:
        if t == urls[index]['tags'][0]:
            count = count + 1
            print u['url']
            for tt in u['tags']:
                print tt,
            print
            break
    if count > 500: break
In [ ]: