In [ ]:
import os
import sys

def download(name, model, iters):
    """Fetch a trained policy checkpoint from the HPC server unless it is cached.

    The checkpoint file is named '<name>-<model>.iter=<iters>.npz' and is
    looked up in the local '.policy/' directory. When it is missing it is
    copied with scp from 'mercer:/scratch/jg5223/exp/.policy/'.

    Args:
        name: run identifier used as the file-name prefix.
        model: base model file name embedded in the checkpoint name.
        iters: iteration count embedded in the checkpoint name.

    Returns:
        None. 'done' is printed when the file is available locally; a failure
        message is printed instead when scp could not fetch it.
    """
    # Local import: the file-level imports only provide os and sys.
    import subprocess

    trained_policy = '{}-{}.iter={}.npz'.format(name, model, iters)
    if not os.path.exists('.policy/{}'.format(trained_policy)):
        print('download the model from HPC server')
        # scp needs the destination directory to exist.
        if not os.path.isdir('.policy'):
            os.makedirs('.policy')
        remote_path = 'mercer:/scratch/jg5223/exp/.policy/{}'.format(trained_policy)
        try:
            # Argument list instead of a formatted shell string: the file name
            # is never interpreted by a local shell.
            status = subprocess.call(['scp', remote_path, '.policy/'])
        except OSError as err:
            # Raised when the scp executable itself cannot be started.
            print('download failed: {}'.format(err))
            return
        if status != 0:
            print('download failed: scp exited with status {}'.format(status))
            return
    print('done')
# m ='model_wmt15_bpe2k_uni_en-ru_rev.npz'    
# download('160920-225547', m, '44400')
# download('160920-225339', m, '33600') 
# download('160920-230514', m, '95600')
# download('160920-225917', m, '68000')
# download('160920-230503', m, '48200')
# download('160920-225454', m, '45200')

# m ='model_wmt15_bpe2k_uni_en-de_rev.npz' 
# download('160925-000643', m, '24400')
# download('160925-000657', m, '10400')
# download('160925-000642', m, '8000') 
# download('160925-000735', m, '22200')
# download('160925-000751', m, '26200')
# download('160925-000811', m, '18800')

# max model 172 160917-175503 Iter=34400 AP=0.3 0.14121125914
# max model 262 160917-001117 Iter=52400 AP=0.5 0.157379802964
# max model 141 160917-175423 Iter=28200 AP=0.7 0.15857511208
# max model 67 160917-175657 Iter=13400 CW=5.0 0.160038686998
# Fetch the checkpoints listed in the "max model" comments above
# (run id, iteration) for the en-de base model.
m = 'model_wmt15_bpe2k_uni_en-de.npz'
for run_id, iteration in [('160917-175503', '34400'),
                          ('160917-001117', '52400'),
                          ('160917-175423', '28200'),
                          ('160917-175657', '13400')]:
    download(run_id, m, iteration)

In [7]:
# %load simultrans_train.py
"""
Simultaneous Machine Translation: Training with Policy Gradient

"""
import argparse
import os
import cPickle as pkl

from bleu import *
from nmt_uni import *
from policy import Controller as Policy
from utils import Progbar, Monitor
from data_iterator import check_length, iterate

from simultrans_model_clean import simultaneous_decoding
from simultrans_model_clean import _seqs2words, _bpe2words, _padding
from config import rl_config
    
import time

# Seed numpy's global random generator with a fixed value.
numpy.random.seed(19920206)
# Alias for time.time; called as timer() when the main loop is set up.
timer = time.time


# Settings returned by rl_config(); the entries read here are the option
# file, the model path and the workspace prefix used for the hidden folders.
config       = rl_config()
options_file = config['option']
model        = config['model']
WORK         = config['workspace']
# NOTE(review): `id` shadows the Python builtin of the same name.
# With id set to None the saved-config reload below is skipped; with remote
# set to False no Monitor is created in the main-loop setup.
id, remote   = None, False

# check hidden folders
paths = ['.policy', '.pretrained', '.log', '.config', '.images', '.translate']
# Each folder name is appended directly to the workspace prefix; folders that
# already exist are left alone.
for hidden_dir in (WORK + folder for folder in paths):
    if os.path.exists(hidden_dir):
        continue
    os.mkdir(hidden_dir)

# When a run id is given and its pickled config exists, reload the stored
# (policy, config) pair so the run continues with the saved settings.
if id is not None:
    fcon = WORK + '.config/{}.conf'.format(id)
    if os.path.exists(fcon):
        print('load config files')
        # Pickles are binary data: read them in 'rb' mode, and use a context
        # manager so the file handle is closed instead of leaked.
        # NOTE(review): unpickling can execute arbitrary code; only reload
        # config files written by a trusted run.
        with open(fcon, 'rb') as f:
            policy, config = pkl.load(f)

# ============================================================================== #
# load model model_options
# ============================================================================== #
# Base file name of the model: everything after the last '/'.
_model = model.rsplit('/', 1)[-1]

# The options are unpickled from the configured option file when there is
# one, otherwise from '<model>.pkl'.
options_path = options_file if options_file is not None else '%s.pkl' % model
with open(options_path, 'rb') as f:
    options = pkl.load(f)

print('merge configuration into options')
# Every config entry overrides the stored option of the same name,
# unconditionally (including entries whose value is None).
options.update(config)

print('load options...')
# Dump the merged options in key order.
for w in sorted(options):
    print('{}: {}'.format(w, options[w]))

def _load_vocab(path):
    """Unpickle a word -> index dictionary and build its index -> word inverse.

    Entries 0 and 1 of the inverse are always set to '<eos>' and 'UNK',
    overriding whatever the pickled dictionary maps to those indices.

    Returns:
        (word_to_index, index_to_word)
    """
    with open(path, 'rb') as f:
        word_to_index = pkl.load(f)
    index_to_word = dict((index, word) for word, index in word_to_index.items())
    index_to_word[0] = '<eos>'
    index_to_word[1] = 'UNK'
    return word_to_index, index_to_word


# load detail settings from option file:
dictionary, dictionary_target = options['dictionaries']

# load source dictionary and invert
word_dict, word_idict = _load_vocab(dictionary)

# load target dictionary and invert
word_dict_trg, word_idict_trg = _load_vocab(dictionary_target)

# NOTE(review): the merge loop earlier in this cell already copies every
# config entry into options, so this looks redundant -- kept as in the original.
options['pre'] = config['pre']

# ========================================================================= #
# Build a Simultaneous Translator
# ========================================================================= #

# allocate model parameters
# Initialise the parameters, overwrite them with the stored model, then wrap
# them for the graph builders below.
params = load_params(model, init_params(options))
tparams = init_tparams(params)

# print 'build the model for computing cost (full source sentence).'
(trng, use_noise,
 _x, _x_mask, _y, _y_mask,
 opt_ret,
 cost, f_cost) = build_model(tparams, options)
print('done')

# functions for sampler
f_sim_ctx, f_sim_init, f_sim_next = build_simultaneous_sampler(tparams, options, trng)

# The sampler functions and the cost function are always available; the three
# fine-tuning functions are appended only when options['finetune'] is set.
funcs = [f_sim_ctx, f_sim_init, f_sim_next, f_cost]
if options['finetune']:
    ff_init, ff_cost, ff_update = build_simultaneous_model(tparams, options, rl=True)
    funcs += [ff_init, ff_cost, ff_update]


# check the ID:
# Record the base model file name in the options handed to the policy.
options['base'] = _model
# n_in is options['readout_dim'], plus one when options['coverage'] is set
# (the conditional expression applies to the whole sum, not just the `1`).
# n_out is 3 when config['forget'] is set, otherwise 2.
agent     = Policy(trng, options,
                   n_in=options['readout_dim'] + 1 if options['coverage'] else options['readout_dim'],
                   n_out=3 if config['forget'] else 2,
                   recurrent=options['recurrent'], id=id)

# make the dataset ready for training & validation
# Both iterators share the same vocabulary files and vocabulary sizes.
src_vocab_file = options['dictionaries'][0]
trg_vocab_file = options['dictionaries'][1]
vocab_sizes = dict(n_words_source=options['n_words_src'],
                   n_words_target=options['n_words'])

# Training data: batch size from the config, length limit from the options.
trainIter = TextIterator(options['datasets'][0], options['datasets'][1],
                         src_vocab_file, trg_vocab_file,
                         batch_size=config['batchsize'],
                         maxlen=options['maxlen'],
                         **vocab_sizes)
train_num = trainIter.num

# Validation data: fixed batch size and cache, with a very large length limit.
validIter = TextIterator(options['valid_datasets'][0], options['valid_datasets'][1],
                         src_vocab_file, trg_vocab_file,
                         batch_size=20, cache=10,
                         maxlen=1000000,
                         **vocab_sizes)
valid_num = validIter.num

print('training set {} lines / validation set {} lines'.format(train_num, valid_num))
# The reward type index is shown as a letter: 0 -> 'A', 1 -> 'B', ...
print('use the reward function {}'.format(chr(ord('A') + config['Rtype'])))


Using gpu device 3: GeForce GTX TITAN X (CNMeM is disabled, cuDNN 5005)
merge configuration into options
load options...
Rtype: 10
act_mask: True
alpha_c: 0.0
batch_size: 64
batchsize: 10
clip_c: 1.0
coverage: False
datasets: ['/misc/kcgscratch1/ChoGroup/thoma_data/simul_trans/un16/train.un16.en-zh.en.c0.tok.clean.bpe20k.np', '/misc/kcgscratch1/ChoGroup/thoma_data/simul_trans/un16/train.un16.en-zh.zh.c0.tok.clean.bpe20k.np']
decay_c: 0.0
decoder: gru_cond
dictionaries: ['/misc/kcgscratch1/ChoGroup/thoma_data/simul_trans/un16/train.un16.en-zh.en.c0.tok.clean.bpe20k.vocab.pkl', '/misc/kcgscratch1/ChoGroup/thoma_data/simul_trans/un16/train.un16.en-zh.zh.c0.tok.clean.bpe20k.vocab.pkl']
dim: 1028
dim_word: 512
dispFreq: 50
encoder: gru
finetune: True
finish_after: 10000000
forget: False
full_att: False
gamma: 1
layernorm: False
lr_model: 2e-05
lr_policy: 0.0002
lrate: 0.0001
max_epochs: 5000
maxlen: 50
maxsrc: 10
model: /misc/kcgscratch1/ChoGroup/thoma_data/simul_trans/.pretrained/model_un16_bpe2k_uni_en-zh.npz
n_words: 20000
n_words_src: 20000
optimizer: adadelta
option: /misc/kcgscratch1/ChoGroup/thoma_data/simul_trans/.pretrained/model_un16_bpe2k_uni_en-zh.npz.pkl
overwrite: False
patience: 1000
peek: 1
pre: False
prop: 0.5
recurrent: True
reload_: False
rl_maxlen: 100
s0: 1
sample: 10
sampleFreq: 99
saveFreq: 1000
saveto: .pretraining/model_un16_bpe2k_uni_en-zh.npz
step: 1
target_ap: 0.8
target_cw: 8
updater: REINFORCE
upper: False
use_dropout: False
validFreq: 1000
valid_batch_size: 64
valid_datasets: ['/misc/kcgscratch1/ChoGroup/thoma_data/simul_trans/un16/devset.un16.en-zh.en.c0.tok.bpe20k.np', '/misc/kcgscratch1/ChoGroup/thoma_data/simul_trans/un16/devset.un16.en-zh.zh.c0.tok.bpe20k.np']
workspace: /misc/kcgscratch1/ChoGroup/thoma_data/simul_trans/
loading Wemb: (20000, 512)
loading Wemb_dec: (20000, 512)
loading encoder_W: (512, 2056)
loading encoder_b: (2056,)
loading encoder_U: (1028, 2056)
loading encoder_Wx: (512, 1028)
loading encoder_bx: (1028,)
loading encoder_Ux: (1028, 1028)
loading ff_state_W: (1028, 1028)
loading ff_state_b: (1028,)
loading decoder_W: (512, 2056)
loading decoder_b: (2056,)
loading decoder_U: (1028, 2056)
loading decoder_Wx: (512, 1028)
loading decoder_Ux: (1028, 1028)
loading decoder_bx: (1028,)
loading decoder_U_nl: (1028, 2056)
loading decoder_b_nl: (2056,)
loading decoder_Ux_nl: (1028, 1028)
loading decoder_bx_nl: (1028,)
loading decoder_Wc: (1028, 2056)
loading decoder_Wcx: (1028, 1028)
loading decoder_W_comb_att: (1028, 1028)
loading decoder_Wc_att: (1028, 1028)
loading decoder_b_att: (1028,)
loading decoder_U_att: (1028, 1)
loading decoder_c_tt: (1,)
loading ff_logit_lstm_W: (1028, 512)
loading ff_logit_lstm_b: (512,)
loading ff_logit_prev_W: (512, 512)
loading ff_logit_prev_b: (512,)
loading ff_logit_ctx_W: (1028, 512)
loading ff_logit_ctx_b: (512,)
loading ff_logit_W: (512, 20000)
loading ff_logit_b: (20000,)
done
Building f_ctx/init... Done.
Building f_sim_next.. with normal input
Done.
compile the initializer
encoder done.
build REINFORCE optimizer for the whole NMT model:
build optimizer with Adam
done.
parameter initialization
building a recurrent controller
start from a new model: 170309-053853
build action sampling function [Discrete]
build action distribiution
setup the advantages & baseline network
build advantages and baseline gradient
build optimizer with Adam
build RENIFROCE.
build REINFORCE optimizer
build optimizer with Adam
done
policy network
policy_net_in_W (2568, 256)
policy_net_in_b (256,)
policy_net_in_U (128, 256)
policy_net_in_Wx (2568, 128)
policy_net_in_bx (128,)
policy_net_in_Ux (128, 128)
policy_net_out_W (128, 2)
policy_net_out_b (2,)
scan the dataset.
scanned 12394614 lines
scan the dataset.
scanned 3999 lines
training set 12394614 lines / validation set 3999 lines
use the reward function K

In [44]:
# ================================================================================= #
# Main Loop: Run
# ================================================================================= #
print('Start Simultaneous Translator...')
# A monitor is only created when `remote` is set.
monitor = Monitor(root='http://localhost:9000') if remote else None

# freqs
save_freq, sample_freq = 200, 10
valid_freq, valid_size = 200, 200
display_freq, finetune_freq = 50, 5

history, last_it = agent.load()
action_space = ['W', 'C', 'F']
Log_avg = dict()
time0 = timer()

# One empty list per field, in a fixed key order.
pipe = OrderedDict()
for key in ('x', 'x_mask', 'y', 'y_mask', 'c_mask'):
    pipe[key] = list()

def _translate(src, trg, samples=None, train=False,
               greedy=False, show=False, full=False):
    """Run simultaneous decoding on one batch and optionally print a summary.

    Args:
        src, trg: source and target batches, passed straight through to
            simultaneous_decoding.
        samples, greedy, train: forwarded unchanged to simultaneous_decoding.
        show: when True, print every entry of the returned info mapping
            except 'advantages' (each converted to float), followed by the
            elapsed wall-clock time.
        full: when True, decode with a shallow copy of the global options in
            which 'finetune' is False; the global options are not modified.

    Returns:
        The value returned by simultaneous_decoding. Its second element is
        read as the info mapping when `show` is set.
    """
    # Imported here so the function does not rely on `copy` having been
    # brought into the namespace by a star import or by a later cell.
    import copy

    time0 = time.time()
    if full:
        options1 = copy.copy(options)
        options1['finetune'] = False
    else:
        options1 = options

    ret   = simultaneous_decoding(
            funcs, agent, options1,
            src, trg, word_idict_trg,
            samples, greedy, train)

    if show:
        info    = ret[1]
        values  = [(w, float(info[w])) for w in info if w != 'advantages']
        summary = ' , '.join(['{}={:.3f}'.format(k, f) for k, f in values])
        # One line of output: the summary, a space, then the elapsed time.
        print('{} ...{}s'.format(summary, time.time() - time0))

    return ret


Start Simultaneous Translator...

In [9]:
# Take the next (sources, targets) pair from the training iterator.
# Each call advances the iterator, so re-running this cell yields different data.
srcs, trgs = trainIter.next()

In [10]:
for s in srcs:
    print len(s),


43 60 44 42 47 44 51 43 41 45

In [45]:
# training set sentence tuning
# Keep only the pairs whose source is longer than options['s0'].
kept = [(src, trg) for src, trg in zip(srcs, trgs) if len(src) > options['s0']]
new_srcs = [pair[0] for pair in kept]
new_trgs = [pair[1] for pair in kept]

srcs, trgs = new_srcs, new_trgs
statistics, info = _translate(srcs, trgs, train=True, show=True, greedy=False)


Adv=0.197 , B_loss=0.014 , p(COMMIT)=0.454 , p(WAIT)=0.546 , StartR=0.061 , J=-0.111 , Delay=0.611 , a_cost=-4.762 , Entropy=-0.027 , Quality=0.034 ...7.75953602791s

In [46]:
for a in statistics['attentions']:
    print '{}: {}'.format(len(a), a[0].shape),


3: (61,) 2: (61,) 2: (61,) 3: (61,) 3: (61,) 4: (61,) 4: (61,) 3: (61,) 4: (61,) 6: (61,) 5: (61,) 5: (61,) 5: (61,) 5: (61,) 9: (61,) 7: (61,) 6: (61,) 7: (61,) 8: (61,) 9: (61,) 11: (61,) 13: (61,) 10: (61,) 10: (61,) 11: (61,) 10: (61,) 8: (61,) 9: (61,) 8: (61,) 10: (61,) 15: (61,) 13: (61,) 14: (61,) 18: (61,) 13: (61,) 15: (61,) 13: (61,) 13: (61,) 15: (61,) 11: (61,) 16: (61,) 14: (61,) 15: (61,) 11: (61,) 16: (61,) 12: (61,) 16: (61,) 12: (61,) 19: (61,) 18: (61,) 13: (61,) 17: (61,) 15: (61,) 19: (61,) 18: (61,) 14: (61,) 16: (61,) 17: (61,) 15: (61,) 18: (61,) 18: (61,) 16: (61,) 17: (61,) 24: (61,) 20: (61,) 19: (61,) 23: (61,) 19: (61,) 19: (61,) 22: (61,) 22: (61,) 18: (61,) 18: (61,) 27: (61,) 30: (61,) 24: (61,) 27: (61,) 18: (61,) 26: (61,) 24: (61,) 29: (61,) 28: (61,) 26: (61,) 32: (61,) 25: (61,) 39: (61,) 34: (61,) 29: (61,) 32: (61,) 33: (61,) 33: (61,) 26: (61,) 37: (61,) 40: (61,) 37: (61,) 44: (61,) 50: (61,) 60: (61,) 59: (61,) 70: (61,)

In [50]:
it = 1

# obtain the translation results
csamples = _bpe2words(
        _seqs2words(statistics['sample'], word_idict_trg,
                    statistics['action'], 1))
csources =  _bpe2words(
        _seqs2words(statistics['SWord'], word_idict,
                    statistics['action'], 0))
sources  = _seqs2words(statistics['SWord'], word_idict)
samples  = _seqs2words(statistics['sample'], word_idict_trg)
targets  =  _bpe2words(
        _seqs2words(statistics['TWord'], word_idict_trg))

c  = 0
for j in range(20, 40):
    print '--Id: {}'.format(j)
    print 'source: ', csources[j]
    print 'sample: ', csamples[j]
    print 'source: ', sources[j]
    print 'sample: ', samples[j]
    print 'target: ', targets[j]
    print 'quality:', statistics['track'][j][0]
    print 'delay:',   statistics['track'][j][1]
    print 'reward:',  statistics['track'][j][2]


--Id: 20
source:  He noted that most of the participants in the symposium were of the view that host countries should be able to regulate FDI so that they could choose their own particular mix of policies and conditions relating to FDI keeping in mind their developmental needs
sample:   指出 大多数 成员  作者 多半   参与者
source:  He noted that most of the participants in the symposium were of the view that host countries should be able to regulate FDI so that they could choose their own particular mix of policies and conditions relating to FDI keeping in mind their developmental needs
sample:  他 指出 大多数 成员 的 作者 多半 都 是 参与者
target:  他 指出 大多数 与会者 认为 接受 投资 的 国家 应该 能够 管理 外国 直接 投资 以便 能 结合 自己 的 发展 需要 选择 有 自己 特点 的 外国 直接 投资 政策 和 条件 的 组合
quality: 0.0013957233579
delay: 0.740259730646
reward: 0.0027914467158
--Id: 21
source:  The Special Representative noted that the use of criminal charges such as UNK encouraging hatred of the State UNK and UNK distributing fal@@ se@@ hoods and r@@ um@@ ors UNK frequently implies the risk of sup@@ pressing legitimate free speech and is particularly worrying when such charges are raised against a person for having den@@ ounced alleged human rights violations
sample:  新@@ 纪@@  权利 特别 代表 支持  性别歧视 组织  代表
source:  The Special Representative noted that the use of criminal charges such as UNK encouraging hatred of the State UNK and UNK distributing fal@@ se@@ hoods and r@@ um@@ ors UNK frequently implies the risk of sup@@ pressing legitimate free speech and is particularly worrying when such charges are raised against a person for having den@@ ounced alleged human rights violations
sample:  新@@ 纪@@ 元 权利 特别 代表 支持 反 性别歧视 组织 的 代表
target:  35 特别 代表 注意 到 采用 诸如 鼓励 对 国家 的 仇恨 和 散布 谎言 和 UNK 言 等 指控 常常 意味着 压制 合法 自由言论 的 风险 在 针对 谴责 指称 侵犯 人权 事项 者 提出 这种 指控 时 尤其 令人担心
quality: 0.000254578728715
delay: 0.692307681657
reward: 0.00050915745743
--Id: 22
source:  178 The Working Group wishes to remind the Government of the Republic of the Congo of its responsibility to conduct thorough and impartial investigations UNK for as long as the fate of the victim of enforced disappearance remains un@@ clarified UNK in accordance with article 13 paragraph 6 of the Declaration
sample:  178 工作组 希望 工作组  祝愿 工作组 进一步 UNK
source:  178 The Working Group wishes to remind the Government of the Republic of the Congo of its responsibility to conduct thorough and impartial investigations UNK for as long as the fate of the victim of enforced disappearance remains un@@ clarified UNK in accordance with article 13 paragraph 6 of the Declaration
sample:  178 工作组 希望 工作组 还 祝愿 工作组 进一步 UNK
target:  178 工作组 谨 提请 刚果共和国 政府 注意 其所 承担 的 责任 即 按照 宣言 第 13 条 第 6 款 进行 彻底 和 公正 的 调查 直至 查明 被 强迫 失踪 的 人 的 命运 为止
quality: 0.000293064431893
delay: 0.722222214198
reward: 0.000586128863786
--Id: 23
source:  In information provided on the application of the Cartagena Action Plan on 13 September 2010 Ethiopia reported 13 areas in which UNK mines are known to be em@@ placed and 44 areas in which UNK mines are suspected to be em@@ placed
sample:  并且 涉及 资料  关于 适用 卡塔赫纳 土地 产品
source:  In information provided on the application of the Cartagena Action Plan on 13 September 2010 Ethiopia reported 13 areas in which UNK mines are known to be em@@ placed and 44 areas in which UNK mines are suspected to be em@@ placed
sample:  并且 涉及 资料 中 关于 适用 卡塔赫纳 土地 产品
target:  埃塞俄比亚 在 2010 年 9 月 13 日 关于 实施 卡塔赫纳 行动计划 的 资料 中 报告 了 13 个 已知 布设 了 杀伤 人员 地雷 的 区域 和 44 个 怀疑 布设 了 杀伤 人员 地雷 的 区域
quality: 0.000233381292852
delay: 0.699999992222
reward: 0.000466762585704
--Id: 24
source:  Direc@@ ting attacks against personnel installations material units or vehicles involved in a humanitarian assistance or peacekeeping mission in accordance with the United Nations Charter as long as they are entitled to the protection given to civilians or civilian objects under international humanitarian law
sample:  方向 指示  打击 攻击 办法 物资 单位  航线
source:  Direc@@ ting attacks against personnel installations material units or vehicles involved in a humanitarian assistance or peacekeeping mission in accordance with the United Nations Charter as long as they are entitled to the protection given to civilians or civilian objects under international humanitarian law
sample:  方向 指示 性 打击 攻击 办法 物资 单位 或 航线
target:  UNK 攻击 执行 人道主义 援助 行动 的 人员 设施 物资 单位 或 车辆 或 根据 联合国 宪章 设立 的 维持 和平 特派团 只要 后者 根据 国际 人道主义法 有权 享有 相同 于 平民 或 民用 物体 的 保护
quality: 0.00093558134552
delay: 0.676767669932
reward: 0.00187116269104
--Id: 25
source:  28 The overall reduced requirements were offset in part by additional requirements with respect to civilian personnel costs due mainly to higher actual deployment of international staff an actual average strength of 316 compared with the projected average strength of 280
sample:  28 28 depts 杂项收入 整体 支出 总额  调整
source:  28 The overall reduced requirements were offset in part by additional requirements with respect to civilian personnel costs due mainly to higher actual deployment of international staff an actual average strength of 316 compared with the projected average strength of 280
sample:  28 28 depts 杂项收入 整体 支出 总额 的 调整
target:  28 文职人员 所 需 资源 增加 主要 原因 是 实际 部署 的 国际 工作人员 增多 实际 平均 人数 为 316 人 而 预计 平均 人数 为 280 人 这部分 抵消 了 所 需 经费 减少 总额
quality: 0.000265020043788
delay: 0.372727269339
reward: 0.000530040087575
--Id: 26
source:  Direc@@ ting attacks against personnel installations material units or vehicles involved in a humanitarian assistance or peacekeeping mission in accordance with the United Nations Charter as long as they are entitled to the protection given to civilians or civilian objects under international humanitarian law
sample:  方向  袭击  手法 AP@@ s
source:  Direc@@ ting attacks against personnel installations material units or vehicles involved in a humanitarian assistance or peacekeeping mission in accordance with the United Nations Charter as long as they are entitled to the protection given to civilians or civilian objects under international humanitarian law
sample:  方向 同 袭击 为 手法 AP@@ s
target:  UNK 攻击 执行 人道主义 援助 行动 的 人员 设施 物资 单位 或 车辆 或 根据 联合国 宪章 设立 的 维持 和平 特派团 只要 后者 根据 国际 人道主义法 有权 享有 相同 于 平民 或 民用 物体 的 保护
quality: 0.0
delay: 0.567307686853
reward: 0.0
--Id: 27
source:  However the increase was offset to some extent by price declines in manufactured goods so that notwithstanding the improvement in some commodity prices on an overall basis developing economies of the ESCAP region suffered ter@@ m@@ s@@ of@@ trade losses during the year
sample:  但是 然而 增加  因果关系 至少 出现 
source:  However the increase was offset to some extent by price declines in manufactured goods so that notwithstanding the improvement in some commodity prices on an overall basis developing economies of the ESCAP region suffered ter@@ m@@ s@@ of@@ trade losses during the year
sample:  但是 然而 增加 与 因果关系 至少 出现 了
target:  但是 这一 增长 在 一定 程度 上 受到 了 制成品 价格 下跌 的 抵消 因此 尽管 某些 商品价格 出现 了 一些 起色 但 总的来说 亚太经社会 的 发展 中 经济体 在 贸易条件 方面 受到 了 损失
quality: 0.000165693344378
delay: 0.606837601651
reward: 0.000331386688757
--Id: 28
source:  b When the work of the Sixth Committee contains new topics or topics rarely covered by United Nations bodies OL@@ A should offer to prepare background documents on legal aspects of the topics that would facilitate their consideration by delegations in subsequent sessions
sample:  b   第二次  养育 
source:  b When the work of the Sixth Committee contains new topics or topics rarely covered by United Nations bodies OL@@ A should offer to prepare background documents on legal aspects of the topics that would facilitate their consideration by delegations in subsequent sessions
sample:  b 水 量 第二次 时 养育 至
target:  b 当 第六 委员会 的 工作 涉及 新 议题 或 联合国 机构 很少 审议 的 议题 时 法律 厅应 主动 编制 有关 这些 议题 涉及 的 法律 问题 的 背景 文件 以便 于 各国 代表团 在 今后 届会 中 对 其 进行 审议
quality: 3.43924054195e-06
delay: 0.374999996875
reward: 6.8784810839e-06
--Id: 29
source:  He noted that most of the participants in the symposium were of the view that host countries should be able to regulate FDI so that they could choose their own particular mix of policies and conditions relating to FDI keeping in mind their developmental needs
sample:   指出 大多数  多数 参与  专题讨论 
source:  He noted that most of the participants in the symposium were of the view that host countries should be able to regulate FDI so that they could choose their own particular mix of policies and conditions relating to FDI keeping in mind their developmental needs
sample:  他 指出 大多数 人 多数 参与 了 专题讨论 会
target:  他 指出 大多数 与会者 认为 接受 投资 的 国家 应该 能够 管理 外国 直接 投资 以便 能 结合 自己 的 发展 需要 选择 有 自己 特点 的 外国 直接 投资 政策 和 条件 的 组合
quality: 0.000648685642831
delay: 0.615384610651
reward: 0.00129737128566
--Id: 30
source:  b When the work of the Sixth Committee contains new topics or topics rarely covered by United Nations bodies OL@@ A should offer to prepare background documents on legal aspects of the topics that would facilitate their consideration by delegations in subsequent sessions
sample:  b 何时 发生 什么 日期  工作 当@@    6  规定
source:  b When the work of the Sixth Committee contains new topics or topics rarely covered by United Nations bodies OL@@ A should offer to prepare background documents on legal aspects of the topics that would facilitate their consideration by delegations in subsequent sessions
sample:  b 何时 发生 什么 日期 的 工作 当@@ 工 时 第 6 种 规定
target:  b 当 第六 委员会 的 工作 涉及 新 议题 或 联合国 机构 很少 审议 的 议题 时 法律 厅应 主动 编制 有关 这些 议题 涉及 的 法律 问题 的 背景 文件 以便 于 各国 代表团 在 今后 届会 中 对 其 进行 审议
quality: 0.0011956466482
delay: 0.570370366145
reward: 0.0023912932964
--Id: 31
source:  b When the work of the Sixth Committee contains new topics or topics rarely covered by United Nations bodies OL@@ A should offer to prepare background documents on legal aspects of the topics that would facilitate their consideration by delegations in subsequent sessions
sample:  b    问题   问题  采纳  除外
source:  b When the work of the Sixth Committee contains new topics or topics rarely covered by United Nations bodies OL@@ A should offer to prepare background documents on legal aspects of the topics that would facilitate their consideration by delegations in subsequent sessions
sample:  b 当 工 作 问题 时 由 问题 未 采纳 的 除外
target:  b 当 第六 委员会 的 工作 涉及 新 议题 或 联合国 机构 很少 审议 的 议题 时 法律 厅应 主动 编制 有关 这些 议题 涉及 的 法律 问题 的 背景 文件 以便 于 各国 代表团 在 今后 届会 中 对 其 进行 审议
quality: 0.000777633634171
delay: 0.708791204897
reward: 0.00155526726834
--Id: 32
source:  He noted that most of the participants in the symposium were of the view that host countries should be able to regulate FDI so that they could choose their own particular mix of policies and conditions relating to FDI keeping in mind their developmental needs
sample:   指出   指出 特别 报告员 强调指出 大多数 难民 参加  研讨会
source:  He noted that most of the participants in the symposium were of the view that host countries should be able to regulate FDI so that they could choose their own particular mix of policies and conditions relating to FDI keeping in mind their developmental needs
sample:  他 指出 了 他 指出 特别 报告员 强调指出 大多数 难民 参加 了 研讨会
target:  他 指出 大多数 与会者 认为 接受 投资 的 国家 应该 能够 管理 外国 直接 投资 以便 能 结合 自己 的 发展 需要 选择 有 自己 特点 的 外国 直接 投资 政策 和 条件 的 组合
quality: 0.0037585926841
delay: 0.373626371573
reward: 0.00751718536821
--Id: 33
source:  The Special Representative noted that the use of criminal charges such as UNK encouraging hatred of the State UNK and UNK distributing fal@@ se@@ hoods and r@@ um@@ ors UNK frequently implies the risk of sup@@ pressing legitimate free speech and is particularly worrying when such charges are raised against a person for having den@@ ounced alleged human rights violations
sample:  特别 代表  指出 特别 代表 注意  特别 代表  刑事诉讼法  援用 引发  酷刑
source:  The Special Representative noted that the use of criminal charges such as UNK encouraging hatred of the State UNK and UNK distributing fal@@ se@@ hoods and r@@ um@@ ors UNK frequently implies the risk of sup@@ pressing legitimate free speech and is particularly worrying when such charges are raised against a person for having den@@ ounced alleged human rights violations
sample:  特别 代表 也 指出 特别 代表 注意 到 特别 代表 对 刑事诉讼法 的 援用 引发 了 酷刑
target:  35 特别 代表 注意 到 采用 诸如 鼓励 对 国家 的 仇恨 和 散布 谎言 和 UNK 言 等 指控 常常 意味着 压制 合法 自由言论 的 风险 在 针对 谴责 指称 侵犯 人权 事项 者 提出 这种 指控 时 尤其 令人担心
quality: 0.013725493175
delay: 0.69696969345
reward: 0.02745098635
--Id: 34
source:  178 The Working Group wishes to remind the Government of the Republic of the Congo of its responsibility to conduct thorough and impartial investigations UNK for as long as the fate of the victim of enforced disappearance remains un@@ clarified UNK in accordance with article 13 paragraph 6 of the Declaration
sample:  178 甘@@  收入  储存 木@@ 新@@  未@@  政府
source:  178 The Working Group wishes to remind the Government of the Republic of the Congo of its responsibility to conduct thorough and impartial investigations UNK for as long as the fate of the victim of enforced disappearance remains un@@ clarified UNK in accordance with article 13 paragraph 6 of the Declaration
sample:  178 甘@@ 蔗 收入 和 储存 木@@ 新@@ 港 未@@ 果 政府
target:  178 工作组 谨 提请 刚果共和国 政府 注意 其所 承担 的 责任 即 按照 宣言 第 13 条 第 6 款 进行 彻底 和 公正 的 调查 直至 查明 被 强迫 失踪 的 人 的 命运 为止
quality: 0.000102370196888
delay: 0.596153843288
reward: 0.000204740393776
--Id: 35
source:  178 The Working Group wishes to remind the Government of the Republic of the Congo of its responsibility to conduct thorough and impartial investigations UNK for as long as the fate of the victim of enforced disappearance remains un@@ clarified UNK in accordance with article 13 paragraph 6 of the Declaration
sample:  178 工作 方面   下列 回复  提醒 该国 政府 注意 阿拉伯叙利亚共和国 政府
source:  178 The Working Group wishes to remind the Government of the Republic of the Congo of its responsibility to conduct thorough and impartial investigations UNK for as long as the fate of the victim of enforced disappearance remains un@@ clarified UNK in accordance with article 13 paragraph 6 of the Declaration
sample:  178 工作 方面 作 了 下列 回复 并 提醒 该国 政府 注意 阿拉伯叙利亚共和国 政府
target:  178 工作组 谨 提请 刚果共和国 政府 注意 其所 承担 的 责任 即 按照 宣言 第 13 条 第 6 款 进行 彻底 和 公正 的 调查 直至 查明 被 强迫 失踪 的 人 的 命运 为止
quality: 0.00463877078523
delay: 0.631111108306
reward: 0.00927754157046
--Id: 36
source:  The State party should promulg@@ ate legal provisions that recognize the right to conscientious objection to military service and establish an alternative to military service that is accessible to all conscientious objec@@ tors and is not punitive or discriminatory in terms of its nature cost or duration
sample:  缔约国 重申  立法 条文 应当 认明 承认 权利  法律 规定
source:  The State party should promulg@@ ate legal provisions that recognize the right to conscientious objection to military service and establish an alternative to military service that is accessible to all conscientious objec@@ tors and is not punitive or discriminatory in terms of its nature cost or duration
sample:  缔约国 重申 该 立法 条文 应当 认明 承认 权利 的 法律 规定
target:  缔约国 应 颁布 法规 承认 出于 良心 拒 服兵役 的 权利 并 制订 兵役 替代 措施 使 所有 出于 良心 拒 服兵役 者 都 可 享有 并且 在 性质 费用 或 持续时间 上 不 具有 惩罚性 或 歧视性
quality: 0.00140314598501
delay: 0.610859725743
reward: 0.00280629197002
--Id: 37
source:  178 The Working Group wishes to remind the Government of the Republic of the Congo of its responsibility to conduct thorough and impartial investigations UNK for as long as the fate of the victim of enforced disappearance remains un@@ clarified UNK in accordance with article 13 paragraph 6 of the Declaration
sample:  178 工作组  提醒 该国 政府 提醒  政府 警惕 刚果共和国 政府
source:  178 The Working Group wishes to remind the Government of the Republic of the Congo of its responsibility to conduct thorough and impartial investigations UNK for as long as the fate of the victim of enforced disappearance remains un@@ clarified UNK in accordance with article 13 paragraph 6 of the Declaration
sample:  178 工作组 愿 提醒 该国 政府 提醒 其 政府 警惕 刚果共和国 政府
target:  178 工作组 谨 提请 刚果共和国 政府 注意 其所 承担 的 责任 即 按照 宣言 第 13 条 第 6 款 进行 彻底 和 公正 的 调查 直至 查明 被 强迫 失踪 的 人 的 命运 为止
quality: 0.00263668211264
delay: 0.624434386315
reward: 0.00527336422528
--Id: 38
source:  In information provided on the application of the Cartagena Action Plan on 13 September 2010 Ethiopia reported 13 areas in which UNK mines are known to be em@@ placed and 44 areas in which UNK mines are suspected to be em@@ placed
sample:    提供  信息   提供  关于 卡塔赫纳 行动计划  请求
source:  In information provided on the application of the Cartagena Action Plan on 13 September 2010 Ethiopia reported 13 areas in which UNK mines are known to be em@@ placed and 44 areas in which UNK mines are suspected to be em@@ placed
sample:  在 所 提供 的 信息 中 都 提供 了 关于 卡塔赫纳 行动计划 的 请求
target:  埃塞俄比亚 在 2010 年 9 月 13 日 关于 实施 卡塔赫纳 行动计划 的 资料 中 报告 了 13 个 已知 布设 了 杀伤 人员 地雷 的 区域 和 44 个 怀疑 布设 了 杀伤 人员 地雷 的 区域
quality: 0.0065393408572
delay: 0.551111108662
reward: 0.0130786817144
--Id: 39
source:  15 There have been successful examples of special measures taken to mitigate the impact of economic crises on women and girls including maintaining necessary social sector expenditures and implementing social protection policies to ensure their rights to health care education and maternal health services
sample:  15 15  特别 措施 减轻 经济 危机  经济
source:  15 There have been successful examples of special measures taken to mitigate the impact of economic crises on women and girls including maintaining necessary social sector expenditures and implementing social protection policies to ensure their rights to health care education and maternal health services
sample:  15 15 有 特别 措施 减轻 经济 危机 对 经济
target:  15 在 采取 特别 措施 减轻 经济危机 对 妇女 和 女童 的 影响 方面 已有 成功 的 范例 包括 维持 必要 的 社会 部门 支出 和 执行 社会保障 政策 以 确保 她们 享受 保健 教育 和 产妇 保健 服务 的 权利
quality: 0.000447588666902
delay: 0.684210523042
reward: -0.0141048226662

In [14]:
def is_str(s):
    """Return True when `s` is a string object (byte string or text).

    On Python 2 this is exactly isinstance(s, basestring). On interpreters
    without a `basestring` builtin it falls back to (str, bytes), which are
    the same two kinds of string.
    """
    try:
        string_types = basestring
    except NameError:
        # Python 3 has no basestring builtin.
        string_types = (str, bytes)
    return isinstance(s, string_types)
# Show the sample left selected by `j`: its string check, the raw value and
# the UTF-8 decoded value.
sample_j = samples[j]
print(is_str(sample_j))
print(sample_j)
print(sample_j.decode('utf-8'))


True
b 用以 加强 主席团 / 小组 提出 的 新 议题
b 用以 加强 主席团 / 小组 提出 的 新 议题

In [26]:
import matplotlib
# reload(matplotlib) 

# import matplotlib.font_manager as fm
from matplotlib import pyplot as plt
from matplotlib import rc
from matplotlib import rcParams

# #rc('font',**{'family':'WenQuanYi Micro Hei', 'weight': 'normal'})
# #rc('font', size=8)
# plot.rcParams['axes.unicode_minus']=False 
import copy
import seaborn as sns
import pandas as pd
# Global seaborn theme for every figure drawn after this cell.
sns.set(context="paper", font="monospace", style='whitegrid')

# rc('text', usetex=True)
# rc('text.latex',unicode=True)
# rc('text.latex',preamble='\usepackage[utf8]{inputenc}')
# rc('text.latex',preamble='\usepackage[russian]{babel}')
# rc('text.latex',preamble='\usepackage[german]{babel}')
# rc('text.latex',preamble='\usepackage[ngerman]{babel}')
# Font loaded from a bundled TTF file; heatmap3 applies it to the x tick labels.
myfont = matplotlib.font_manager.FontProperties(fname='./utils/msyh.ttf')

# Minus-sign rendering and tick label sizes, set one key at a time.
for rc_key, rc_value in (('axes.unicode_minus', False),
                         ('ytick.labelsize', 11),
                         ('xtick.labelsize', 11)):
    matplotlib.rcParams[rc_key] = rc_value

def heatmap3(sources, refs, trans, actions, idx, atten=None, full_atten=None, savefig=True, name='test', info=None, show=False):
    """Draw the attention heatmap(s) for one source/translation pair.

    Rows are the source tokens framed by '*' and '||'; columns are the
    translation tokens framed by '*' and '||', '*'. '@@' in either sentence
    is displayed as '--'.

    Args:
        sources: sequence of UTF-8 encoded, whitespace-separated source
            sentences; only sources[idx] is used.
        refs: unused; kept so existing positional calls keep working.
        trans: sequence of UTF-8 encoded, whitespace-separated translations;
            only trans[idx] is used.
        actions: sequence of action sequences; only actions[idx] is read.
        idx: index of the sentence to plot.
        atten: optional sequence of attention data; atten[idx] is converted
            to an array and its transpose is added to the interior of the
            grid (first/last row and column excluded).
        full_atten: optional second attention set. When given, a second
            panel is drawn next to the first one using full_atten[idx].
        savefig: save the figure as a PDF under '.images/M_<name>/'.
        name: suffix of the output directory.
        info: mapping encoded into the file name. info['index'] forms the
            'Idx=<index>||' prefix; every other entry is appended as
            '.<key>=<value formatted as a float with two decimals>'.
            None is treated as an empty mapping.
        show: call plt.show() before the figure is closed.
    """
    source = ['*'] + [s.strip() for s in sources[idx].decode('utf8').replace('@@', '--').split()] + ['||']
    target = ['*'] + [s.strip() for s in trans[idx].decode('utf8').replace('@@', '--').split()] + ['||'] + ['*']
    action = actions[idx]

    attention = numpy.array(atten[idx]) if atten else None
    fullatten = numpy.array(full_atten[idx]) if full_atten else None

    def track(acts, data, annote):
        # Walks the action sequence counting each action value. The code that
        # marked cells from these counters is disabled, so `data` and
        # `annote` are returned unchanged.
        x, y, z = 0, 0, 0
        for a in acts:
            x += (a == 1)
            y += (a == 0)
            z += (a == 2)

            # data[y + 1, x]   = 1
            # data[z, x + 1]   = 1
            # annote[y, x] = 'W' if a == 0  else 'C'

        return data, annote

    data       = numpy.zeros((len(source), len(target)))
    annote     = numpy.chararray(data.shape, itemsize=8)
    annote[:]  = ''
    data, annote  = track(action, data, annote)
    data[1, 0] = 1

    def draw(data_t, ax, attention=None):
        # Work on a copy so both panels start from the same base grid.
        data   = copy.copy(data_t)
        # Without attention weights only the base grid is drawn.
        if attention is not None:
            data[1:-1, 1:-1] += attention.T
        d  = pd.DataFrame(data=data, columns=target, index=source)
        # Cells that are exactly zero are masked out of the heatmap.
        g  = sns.heatmap(d, mask=(data==0), square=True, cbar=False,
                         linewidths=0.1, ax=ax, annot=annote, fmt='s')
        g.xaxis.tick_top()

        for tick in ax.get_xticklabels():
            tick.set_rotation(60)
            tick.set_fontproperties(myfont)
        for tick in ax.get_yticklabels():
            tick.set_rotation(0)

        ax.grid(True)

    if full_atten:
        f, [ax1, ax2] = plt.subplots(1, 2, figsize=(22, 11))
        # `plt`, not `plot`: the original name was undefined in this module.
        f.set_canvas(plt.gcf().canvas)

        draw(data, ax1, attention)
        draw(data, ax2, fullatten)
    else:
        f, ax1 = plt.subplots(1, 1, figsize=(22, 22))
        f.set_canvas(plt.gcf().canvas)

        draw(data, ax1, attention)

    if savefig:
        out_dir = '.images/M_{}'.format(name)
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

        # The defaults are savefig=True with info=None; do not crash on them.
        details  = {} if info is None else info
        filename = 'Idx={}||'.format(details.get('index'))
        for w in details:
            # Value comparison: `is not` on a string literal only works when
            # the key happens to be the same interned object.
            if w != 'index':
                filename += '.{}={:.2f}'.format(w, float(details[w]))

        plt.savefig(out_dir + '/{}'.format(filename) + '.pdf', dpi=100)

    if show:
        plt.show()

    plt.close()

In [35]:
# Plot the sample selected by `j`, which this cell does not set itself.
src, trs = [sources[j]], [samples[j]]
L   = len(src[0].split())
act = [statistics['action'][j]]
# Keep only the first L entries of every attention vector.
att = [[step[:L] for step in statistics['attentions'][j]]]
print(att[0][0].shape)
heatmap3(src, None, trs, act, 0, atten=att, full_atten=None,
         name='test', info={'index': 'test'}, show=True, savefig=False)


(41,)

In [51]:
# Plot sample 25 with its attention truncated to the source length.
j = 25
src, trs = [sources[j]], [samples[j]]
L   = len(src[0].split())
act = [statistics['action'][j]]
# Keep only the first L entries of every attention vector.
att = [[step[:L] for step in statistics['attentions'][j]]]
print(att[0][0].shape)
heatmap3(src, None, trs, act, 0, atten=att, full_atten=None,
         name='test', info={'index': 'test'}, show=True, savefig=False)


(41,)

In [15]:
# -*- coding: utf-8 -*-   
from pylab import *  
 
# t = arange(-5*pi, 5*pi, 0.01)  
# y = sin(t)/t  
# plt.plot(t, y)  
# plt.title(u'这里写的是中文',fontproperties=myfont) #指定字体  
# plt.xlabel(u'X坐标',fontproperties=myfont)  
# plt.ylabel(u'Y坐标',fontproperties=myfont)  
# plt.show()

In [ ]: