In [ ]:
#### Import modules
import json
import os
import numpy as np
import time
import datetime
import random
import tensorflow as tf
from six.moves import cPickle as pickle

In [ ]:
## Load the pre-built word-vector dictionary file
data_dir = "????/kaggle/data"
wordvecfile = "wordsvector.txt"
path = os.path.join(data_dir, wordvecfile)
with open(path) as fd:
    line = fd.readline()
wordvecdic = json.loads(line)
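
In [ ]:
## Illustrative sketch (an assumption inferred from the loader above, not part of the
## original pipeline): wordsvector.txt is expected to hold a single JSON line mapping
## each word to its 50-dimensional embedding. A file written like this would load:
example_vocab = {"what": np.random.uniform(-0.25, 0.25, 50).tolist(),
                 "is": np.random.uniform(-0.25, 0.25, 50).tolist()}
with open("wordsvector_example.txt", "w") as f:   # hypothetical file name
    f.write(json.dumps(example_vocab))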

In [ ]:
### Timestamped print helper, used by the functions below
def log_time(s):
    now = datetime.datetime.now()
    print("[%s][%s.%s][%s]" % (now.strftime('%Y-%m-%d %H-%M-%S'), now.second, now.microsecond, s))
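
In [ ]:
## Example call (illustrative): prints a line such as
## [2017-05-01 12-30-05][5.123456][begin]
log_time("begin")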

In [ ]:
#### Convert a sentence (a list of words) into a sentence vector; used by generate_dataset
def sentence_vecor(wordlist):
    vector = []
    for word in wordlist:
        try:
            vector.append(wordvecdic[word])
        except KeyError:
            # Out-of-vocabulary word: back off to a random 50-dim vector
            vector.append(np.random.uniform(-0.25, 0.25, 50).tolist())
    return vector
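
In [ ]:
## Quick sanity check (illustrative, assuming wordvecdic is already loaded): known
## words map to their stored embeddings, unknown words get random 50-dim vectors.
vec = sentence_vecor(["what", "is", "unseen_word_xyz"])
print(len(vec), len(vec[0]))   # 3 words, each a 50-dim vector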

In [ ]:
### Convert a file into datasets, returned as several multi-dimensional arrays.
### node distinguishes training (& validation) data from test data:
### node == "train": returns dataset1, dataset2, labels
### node == "test":  returns dataset1, dataset2
def generate_dataset(datafile, samplesize, node):
    fd = open(datafile)
    # One slot per sample: up to 300 words per question, 50 dims per word
    dataset1 = np.zeros((samplesize, 300, 50), dtype=np.float32)
    dataset2 = np.zeros((samplesize, 300, 50), dtype=np.float32)
    if node == "train":
        labels = np.zeros(samplesize, dtype=np.int32)
        i = 0
        for line in fd:
            data = line.split("\t")
            question1, question2, is_duplicate = json.loads(data[0]), json.loads(data[1]), data[2]
            dataset1[i][:len(question1)] = sentence_vecor(question1)
            dataset2[i][:len(question2)] = sentence_vecor(question2)
            labels[i] = int(is_duplicate)
            i += 1
        fd.close()
        return dataset1, dataset2, labels
    elif node == "test":
        i = 0
        for line in fd:
            data = line.split("\t")
            question1, question2 = json.loads(data[0]), json.loads(data[1])
            dataset1[i][:len(question1)] = sentence_vecor(question1)
            dataset2[i][:len(question2)] = sentence_vecor(question2)
            i += 1
        fd.close()
        return dataset1, dataset2
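
In [ ]:
## Illustrative sketch (an assumption about the data layout, inferred from the parser
## above): each line holds two JSON word lists and, for training data, a 0/1
## duplicate flag, separated by tabs. The file name is hypothetical.
with open("train_example.txt", "w") as f:
    f.write(json.dumps(["what", "is", "ai"]) + "\t" +
            json.dumps(["define", "ai"]) + "\t" + "1\n")
d1, d2, y = generate_dataset("train_example.txt", 1, "train")
print(d1.shape, d2.shape, y)   # (1, 300, 50) (1, 300, 50) [1]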

In [ ]:
## Inputs: filepath and code ("train": dataset is dataset1, dataset2, labels;
## "test": dataset is dataset1, dataset2).
## E.g. (/home/lixuelian/lillian/kaggle/data/test, "test") replaces every file under
## that path with its pickled dataset dictionary; deserialize later with pickle.load.
def filepath_dataset(filepath, code):
    log_time("begin")
    files = os.listdir(filepath)
    if code == "train":
        for trainfile in files:
            trainfile = os.path.join(filepath, trainfile)
            # Count lines to size the arrays: one sample per line
            f = open(trainfile)
            samplesize = len(f.readlines())
            f.close()
            try:
                train_dataset1, train_dataset2, train_labels = generate_dataset(trainfile, samplesize, "train")
                # Replace the raw text file with its pickled dataset
                if os.path.exists(trainfile): os.remove(trainfile)
                f = open(trainfile, 'wb')
                save = {'train_dataset1': train_dataset1, 'train_dataset2': train_dataset2, 'train_labels': train_labels}
                pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
                f.close()
                del train_dataset1, train_dataset2, train_labels
            except Exception as e:
                log_time(trainfile + ": failed: " + str(e))
    elif code == "test":
        for testfile in files:
            testfile = os.path.join(filepath, testfile)
            # Count lines to size the arrays: one sample per line
            f = open(testfile)
            samplesize = len(f.readlines())
            f.close()
            try:
                test_dataset1, test_dataset2 = generate_dataset(testfile, samplesize, "test")
                # Replace the raw text file with its pickled dataset
                if os.path.exists(testfile): os.remove(testfile)
                f = open(testfile, 'wb')
                save = {'test_dataset1': test_dataset1, 'test_dataset2': test_dataset2}
                pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
                f.close()
                del test_dataset1, test_dataset2
            except Exception as e:
                log_time(testfile + ": failed: " + str(e))
    log_time("end")

In [ ]:
### Load a pickled array-dictionary file
def load_dataset(filename, node):
    with open(filename, 'rb') as f:
        data = pickle.load(f)
        try:
            if node == "train":
                ## Sample matrix for the first sentence: np.ndarray of shape (samplesize, 300, 50), dtype=np.float32
                train_dataset1 = data['train_dataset1']
                ## Sample matrix for the second sentence: np.ndarray of shape (samplesize, 300, 50), dtype=np.float32
                train_dataset2 = data['train_dataset2']
                ## Class labels: np.ndarray of shape (samplesize,), dtype=np.int32
                train_labels = data['train_labels']
                return train_dataset1, train_dataset2, train_labels
            elif node == "test":
                test_dataset1 = data["test_dataset1"]
                test_dataset2 = data["test_dataset2"]
                return test_dataset1, test_dataset2
        except KeyError:
            log_time(filename + ": load failed")
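
In [ ]:
## Usage sketch (the shard name is hypothetical): reload one pickled training file
## and check that the array shapes round-trip.
train_d1, train_d2, train_y = load_dataset("/home/lixuelian/lillian/kaggle/data/train/part-00000", "train")
print(train_d1.shape, train_d2.shape, train_y.shape)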