In [4]:
# Imports and global configuration for the autoencoder experiments.
%matplotlib inline
import tensorflow as tf
from vahun.corpus import TSV_Corpus as Corpus
import numpy as np
from vahun.tools import Timer
from vahun.tools import explog
#from vahun.autoencoder import Autoencoder_ffnn
from vahun.tools import show_performance
from vahun.genetic import Settings
from vahun.tools import get_reconstruction
from vahun.Autoencoder_FFNN import Autoencoder_FFNN
from vahun.Autoencoder_Variational import Autoencoder_Variational
timer=Timer()
# Number of corpus entries to load (reused as the `size=` argument below).
size=400000
# Let TensorFlow allocate GPU memory on demand instead of grabbing it all up front.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
In [1]:
In [1]:
from vahun.corpus import TrainXY_Corpus
In [2]:
# Load the segmented webcorpus (hard-coded absolute path; NOTE(review): consider
# a configurable DATA_DIR so the notebook runs outside this one machine).
arg=TrainXY_Corpus('/mnt/store/velkey/mnsz2/webcorp.full.enfilt.segmented',size=400000)
In [3]:
Out[3]:
In [24]:
import random
def levennoise(corpus, word, dist=2):
    """Apply up to `dist` random single-character edits to `word`.

    Each round picks one of delete / insert / substitute with roughly equal
    probability; inserted/substituted characters are drawn uniformly from
    ``corpus.abc``.  Words longer than 18 characters are returned untouched
    (they would not fit the fixed-length one-hot encoding).

    Args:
        corpus: object exposing an ``abc`` alphabet (iterable of characters).
        word:   the string to noisify.
        dist:   maximum number of edit operations to apply.

    Returns:
        The noisified word (may equal the input when no edit applied).
    """
    if len(word) > 18:
        return word
    alphabet = list(corpus.abc)
    for _ in range(dist):
        a = random.random()
        if a <= 0.333:
            # Deletion.  BUG FIX: randint(0, len(word)) could return len(word),
            # making the deletion a silent no-op; randrange picks a valid index.
            if word:  # cannot delete from an empty string
                r = random.randrange(len(word))
                word = word[:r] + word[r + 1:]
        elif a <= 0.666:
            # Insertion — index len(word) is valid here (append at the end).
            # BUG FIX: randint(0, len(abc)) - 1 produced index -1 as often as any
            # other, so the last alphabet character was chosen twice as often;
            # random.choice is uniform.
            r = random.randint(0, len(word))
            ch = random.choice(alphabet)
            word = word[:r] + ch + word[r:]
        else:
            # Substitution.  BUG FIX: at r == len(word) the original appended a
            # character instead of substituting one.
            if word:
                r = random.randrange(len(word))
                ch = random.choice(alphabet)
                word = word[:r] + ch + word[r + 1:]
    return word
def levenshtein_noisify(corpus):
    """Noisify every word of the corpus with random Levenshtein edits.

    Returns a tuple ``(noisy_words, features)`` where ``features`` is the
    char-level one-hot encoding of the noisified word list.
    """
    noisy_words = []
    for original in corpus.wordlist:
        noisy_words.append(levennoise(corpus, original))
    features = corpus.featurize_data_charlevel_onehot(noisy_words)
    return noisy_words, features
In [27]:
# NOTE(review): `corpuses` is defined nowhere in this notebook — it must come
# from a deleted or unexecuted cell, so this line raises NameError on a fresh
# kernel (Restart & Run All).  Confirm where `corpuses` is built.
levenwords,X_leven=levenshtein_noisify(corpuses[0])
In [ ]:
# Run autoencoder experiments 10..19 from the experiment descriptor file.
# Each descriptor line is tab-separated ints:
#   [0] 1 = variational / 0 = plain FFNN autoencoder
#   [1] 1 = random corpus / 0 = top corpus
#   [2] 1 = digraph-replaced corpus / 0 = unigraph corpus
#   [3:] layer sizes for the network.
exps = []
ranger = range(10, 20)
with open('/mnt/store/velkey/experiments') as f:
    # IDIOM FIX: enumerate replaces the manual `i = 0; i += 1` counter.
    for i, line in enumerate(f):
        if i in ranger:
            exps.append(line.strip().split('\t'))

# Map the (random?, digraph?) flag pair to the corpus index and its file path;
# replaces four copy-pasted if-blocks.
CORPUS_VARIANTS = {
    (0, 0): (0, '/mnt/store/velkey/mnsz2/filt.200k.maxlen20'),
    (1, 0): (1, '/mnt/store/velkey/mnsz2/filt.200k_random.maxlen20'),
    (0, 1): (2, '/mnt/store/velkey/mnsz2/filt.200k.maxlen20.digraph_repl'),
    (1, 1): (3, '/mnt/store/velkey/mnsz2/filt.200k_random.maxlen20.digraph_repl'),
}

for exper in exps:
    exper = [int(item) for item in exper]
    layerlist = exper[3:]
    settings = Settings(layerlist)
    typ, corpus_path = CORPUS_VARIANTS[(exper[1], exper[2])]
    # NOTE(review): `corpuses` is not defined in this notebook — it must come
    # from a missing/earlier cell; this will NameError on a fresh kernel.
    corpus = corpuses[typ]
    name = ("uniq_"
            + ("variational_" if exper[0] == 1 else "autoencoder_")
            + ("top_" if exper[1] == 1 else "random_")
            + ("bigraph_" if exper[2] == 1 else "uni_"))
    logger = explog(encoder_type=name,
                    encoding_dim=0,
                    feature_len=20,
                    lang=corpus_path,
                    unique_words=len(set(corpus.wordlist)),
                    name=name,
                    population_size=0,
                    words=len(corpus.wordlist))
    # Each experiment is run twice.
    for k in range(2):
        print("starting experiment: ", exper)
        timer.add("experiment")
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        try:
            if exper[0] == 1:
                # BUG FIX: original called undefined `Variational_autoencoder`;
                # the class imported at the top is Autoencoder_Variational.
                # NOTE(review): keyword arguments kept as written — confirm they
                # match Autoencoder_Variational's constructor.
                encoder = Autoencoder_Variational(
                    logger=logger,
                    tf_session=sess,
                    inputdim=len(corpus.abc) * 20,
                    encoding_size=settings.weights[0],
                    corpus=corpus,
                    optimizer=tf.train.AdamOptimizer(learning_rate=0.001),
                    nonlinear=tf.sigmoid,
                    charnum=len(corpus.abc))
            else:
                # BUG FIX: original called `Autoencoder_ffnn`, whose import is
                # commented out; the class imported at the top is Autoencoder_FFNN.
                encoder = Autoencoder_FFNN(
                    experiment=settings,
                    logger=logger,
                    tf_session=sess,
                    inputdim=len(corpus.abc) * 20,
                    layerlist=settings.weights,
                    encode_index=int(len(settings.weights) / 2),
                    corpus=corpus,
                    optimizer=tf.train.AdamOptimizer(learning_rate=0.001),
                    nonlinear=tf.sigmoid,
                    charnum=len(corpus.abc))
            encoder.train(corpus.x_train, corpus.x_valid, corpus.x_test, 512, 80)
            print("Finished in:", timer.get("experiment"), "s")
        finally:
            # LEAK FIX: sessions were never closed, so GPU memory accumulated
            # across the 2 x N runs; close each session when its run ends.
            sess.close()
In [ ]: