In [1]:
%matplotlib inline
import tensorflow as tf
from vahun.corpus import TrainXY_Corpus as Corpus
import numpy as np
from vahun.tools import Timer
from vahun.tools import explog
#from vahun.autoencoder import Autoencoder_ffnn
from vahun.tools import show_performance
from vahun.genetic import Settings
from vahun.tools import get_reconstruction
from vahun.Autoencoder_FFNN import Autoencoder_FFNN
from vahun.Autoencoder_Variational import Autoencoder_Variational
timer=Timer()
corpus_path='/mnt/store/velkey/mnsz2/webcorp.full.enfilt.segmented'
encode=800
corpus=Corpus(corpus_path=corpus_path,size=400000)
logger=explog(encoder_type="demo_autoencoder_segmented_"+str(encode),
              encoding_dim=encode,
              feature_len=20,
              lang=corpus_path,
              unique_words=len(corpus.wordlist),
              name="demo_autoencoder_top_segmented_"+str(encode),
              population_size=0,
              words=len(corpus.wordlist),
              path='/tmp/')
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
In [2]:
encoder=Autoencoder_FFNN(
    logger=logger, tf_session=sess,
    inputdim=len(corpus.abc)*20,
    layerlist=[encode, len(corpus.abc)*20],
    encode_index=1, corpus=corpus,
    optimizer=tf.train.AdamOptimizer(learning_rate=0.001),
    nonlinear=tf.sigmoid, disp_step=40,
    charnum=len(corpus.abc))
encoder.train(corpus.x_train, corpus.x_valid, corpus.x_test,
              512, 40,
              corpus.y_train, corpus.y_valid, corpus.y_test)
In [3]:
encoder.save("/mnt/store/velkey/graphs/auto800.graph")
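In [ ]:
# A minimal restore sketch, assuming encoder.save wraps tf.train.Saver
# (the vahun API may differ); reloads the variables saved above.
saver = tf.train.Saver()
saver.restore(sess, "/mnt/store/velkey/graphs/auto800.graph")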
In [5]:
result=encoder.get_reconstruction_splitbrain(corpus.x_test,corpus,corpus.y_test)
In [6]:
with open('/mnt/store/velkey/Segmented_500_encoding', "a") as myfile:
    for it in result:
        # tab-separate the fields of each record, one record per line
        string=""
        for i in it:
            string+=str(i)
            string+='\t'
        string+='\n'
        myfile.write(string)
In [8]:
for i in range(30):
    print(result[i][3],'\t',result[i][4])
In [10]:
# corpus (not the undefined Xcorpus) is passed here; the return value is
# assumed to hold the per-dimension standard deviations used as `stds` below.
stds=show_performance(encoder,
                      ["eh","kecske","kutya","aytuk","macska","árvíztűrő","fúró","kacsa","a","és"],
                      corpus,printer=True,inputfsize=len(corpus.abc))
In [ ]:
def decode_critical(lista,enc=180):
    # For each latent index, build a code that is strongly negative everywhere
    # except one strongly positive dimension, then decode and print the result.
    for POS in lista:
        encoded=np.ones(enc)*-10
        encoded[POS]=10
        # reshape to (feature_len, |abc|); feature_len is 20 in this run
        a=encoder.decode([encoded])[0].reshape([20,len(corpus.abc)])
        b=corpus.defeaturize_data_charlevel_onehot([a])
        print(b)
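In [ ]:
# Hypothetical usage of decode_critical: probe a few illustrative latent
# dimensions (these indices are examples, not from the original run).
decode_critical([0, 1, 2], enc=encode)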
In [ ]:
import heapq
# stds: per-dimension standard deviations from show_performance above (assumed);
# pick the 6 dimensions with the largest spread.
topstd=heapq.nlargest(6, range(len(stds)), stds.__getitem__)
big_stuff=[]
for i in range(50):
    big_stuff.append([])
    for topind in topstd:
        # encode a test word, overwrite one high-variance dimension, decode
        a=encoder.encode([corpus.x_test[i]])
        a[0][topind]=2
        b=encoder.decode([a[0]])[0].reshape([20,len(corpus.abc)])
        c=corpus.defeaturize_data_charlevel_onehot([b])
        big_stuff[i].append(c)
for row in big_stuff:
    print(row)