In [ ]:
#### Import modules
import json
import os
import numpy as np
import time
import datetime
import random
import tensorflow as tf
from six.moves import cPickle as pickle
In [ ]:
## Load the existing word-vector dictionary file.
## wordsvector.txt is expected to hold a single JSON line mapping each word to its 50-dim vector.
data_dir = "????/kaggle/data"   # path truncated in the original; fill in the real data directory
wordvecfile = "wordsvector.txt"
path = os.path.join(data_dir, wordvecfile)
with open(path) as fd:
    wordvecdic = json.loads(fd.readline())
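A quick, illustrative sanity check (not part of the original pipeline) to confirm the loaded dictionary maps each word to a 50-dim list, which is what sentence_vecor below assumes:

In [ ]:
## Sanity-check the loaded word-vector dictionary (illustrative only).
print(len(wordvecdic))                        # vocabulary size
some_word = next(iter(wordvecdic))            # pick an arbitrary word from the dict
print(some_word, len(wordvecdic[some_word]))  # expect a 50-dim vector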
In [ ]:
### Timestamped print helper, used by the functions below.
def log_time(s):
    now = datetime.datetime.now()
    print("[%s][%s.%s][%s]" % (time.strftime('%Y-%m-%d %H-%M-%S', time.localtime(time.time())),
                               now.second, now.microsecond, s))
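For reference, a sketch of what log_time prints (the exact timestamp and microseconds will of course differ):

In [ ]:
## Example call; prints something like [2017-05-01 10-30-05][5.123456][loading data]
log_time("loading data")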
In [ ]:
#### Convert a sentence (list of words) into a sentence vector; used by generate_dataset below.
def sentence_vecor(wordlist):
    vector = []
    for word in wordlist:
        try:
            vector.append(wordvecdic[word])
        except KeyError:
            # Out-of-vocabulary word: fall back to a random 50-dim vector.
            vector.append(np.random.uniform(-0.25, 0.25, 50).tolist())
    return vector
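A minimal sketch of sentence_vecor in use; the token list is hypothetical. Each returned row is a 50-dim vector, with random vectors substituted for out-of-vocabulary words:

In [ ]:
## Illustrative only: convert a tokenized question to a list of 50-dim vectors.
tokens = ["how", "do", "i", "learn", "python"]   # hypothetical token list
vec = sentence_vecor(tokens)
print(len(vec), len(vec[0]))   # expect: 5 50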
In [ ]:
### Convert a file into arrays; node distinguishes training (& validation) data from test data.
### node == "train": returns dataset1, dataset2, labels
### node == "test":  returns dataset1, dataset2
def generate_dataset(datafile, samplesize, node):
    # Each sentence is zero-padded to 300 words; word vectors are 50-dim.
    dataset1 = np.zeros((samplesize, 300, 50), dtype=np.float32)
    dataset2 = np.zeros((samplesize, 300, 50), dtype=np.float32)
    with open(datafile) as fd:
        if node == "train":
            labels = np.zeros(samplesize, dtype=np.int32)
            for i, line in enumerate(fd):
                data = line.split("\t")
                question1, question2, is_duplicate = json.loads(data[0]), json.loads(data[1]), data[2]
                dataset1[i][:len(question1)] = sentence_vecor(question1)
                dataset2[i][:len(question2)] = sentence_vecor(question2)
                labels[i] = int(is_duplicate)
            return dataset1, dataset2, labels
        elif node == "test":
            for i, line in enumerate(fd):
                data = line.split("\t")
                question1, question2 = json.loads(data[0]), json.loads(data[1])
                dataset1[i][:len(question1)] = sentence_vecor(question1)
                dataset2[i][:len(question2)] = sentence_vecor(question2)
            return dataset1, dataset2
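generate_dataset assumes each input line is tab-separated: two JSON-encoded token lists, plus (for training data) a 0/1 duplicate label. A hedged sketch that writes one such line so the expected file format is explicit; the file name and tokens are hypothetical:

In [ ]:
## Illustrative only: the line format generate_dataset parses.
with open("train_sample.txt", "w") as f:   # hypothetical file
    q1 = ["how", "do", "i", "learn", "python"]
    q2 = ["what", "is", "the", "best", "way", "to", "learn", "python"]
    f.write("%s\t%s\t%d\n" % (json.dumps(q1), json.dumps(q2), 1))
d1, d2, labels = generate_dataset("train_sample.txt", 1, "train")
print(d1.shape, d2.shape, labels)   # expect: (1, 300, 50) (1, 300, 50) [1]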
In [ ]:
## Input: filepath and code ("train": dataset is dataset1, dataset2, labels; "test": dataset is dataset1, dataset2).
## E.g. filepath_dataset("/home/lixuelian/lillian/kaggle/data/test", "test") replaces every file under that
## path with its pickled dataset dict, to be deserialized later with pickle.load.
def filepath_dataset(filepath, code):
    log_time("begin")
    files = os.listdir(filepath)
    if code == "train":
        for trainfile in files:
            trainfile = os.path.join(filepath, trainfile)
            # Count the lines to get the sample size for this file.
            with open(trainfile) as f:
                num_lines = len(f.readlines())
            try:
                train_dataset1, train_dataset2, train_labels = generate_dataset(trainfile, num_lines, "train")
                ## Replace the raw text file with its pickled arrays.
                if os.path.exists(trainfile): os.remove(trainfile)
                with open(trainfile, 'wb') as f:
                    save = {'train_dataset1': train_dataset1, 'train_dataset2': train_dataset2, 'train_labels': train_labels}
                    pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
                del train_dataset1, train_dataset2, train_labels
            except Exception as e:
                log_time("%s: failed (%s)" % (trainfile, e))
    elif code == "test":
        for testfile in files:
            testfile = os.path.join(filepath, testfile)
            with open(testfile) as f:
                num_lines = len(f.readlines())
            try:
                test_dataset1, test_dataset2 = generate_dataset(testfile, num_lines, "test")
                ## Replace the raw text file with its pickled arrays.
                if os.path.exists(testfile): os.remove(testfile)
                with open(testfile, 'wb') as f:
                    save = {'test_dataset1': test_dataset1, 'test_dataset2': test_dataset2}
                    pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
                del test_dataset1, test_dataset2
            except Exception as e:
                log_time("%s: failed (%s)" % (testfile, e))
    log_time("end")
In [ ]:
### Load a pickled array-dict file.
def load_dataset(filename, node):
    with open(filename, 'rb') as f:
        data = pickle.load(f)
    try:
        if node == "train":
            ## Sample matrix for the first sentence: shape (samplesize, 300, 50), dtype np.float32.
            train_dataset1 = data['train_dataset1']
            ## Sample matrix for the second sentence: shape (samplesize, 300, 50), dtype np.float32.
            train_dataset2 = data['train_dataset2']
            ## Class labels: shape (samplesize,), dtype np.int32.
            train_labels = data['train_labels']
            return train_dataset1, train_dataset2, train_labels
        elif node == "test":
            test_dataset1 = data["test_dataset1"]
            test_dataset2 = data["test_dataset2"]
            return test_dataset1, test_dataset2
    except KeyError:
        log_time(filename + ": load failed")
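Finally, a hedged example of reading one pickled batch back and checking the array shapes; the file name is hypothetical:

In [ ]:
## Illustrative only: reload a pickled training file and verify shapes.
d1, d2, labels = load_dataset(os.path.join("????/kaggle/data/train", "part-00000"), "train")
print(d1.shape, d2.shape, labels.shape)   # expect: (N, 300, 50) (N, 300, 50) (N,)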