In [1]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys

In [8]:
# Fetch the Nietzsche corpus through Keras' get_file helper and load it lower-cased.
path = get_file('nietzsche.txt', origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt')
# Read inside a context manager so the file handle is closed deterministically,
# and decode explicitly as UTF-8 rather than with the platform-default encoding.
with open(path, encoding='utf-8') as corpus_file:
    text = corpus_file.read().lower()
print('corpus length:', len(text))


corpus length: 600893

In [9]:
# Vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(text))

In [11]:
# Report the vocabulary size (number of distinct characters in the corpus).
print('total chars:', len(chars))


total chars: 57

In [12]:
# Lookup tables between characters and their integer ids (position in `chars`).
indices_char = dict(enumerate(chars))                          # id -> character
char_indices = {ch: idx for idx, ch in indices_char.items()}   # character -> id

In [15]:
maxlen = 40  # length, in characters, of every input window cut from the text
step = 3     # stride between window starts; step < maxlen, so windows overlap
# Start offsets of all windows that still have a following character available.
window_starts = range(0, len(text) - maxlen, step)
# Inputs: the 40-character strings beginning at each offset.
sentences = [text[start: start + maxlen] for start in window_starts]
# Targets: the single character right after each window (what we want to predict).
next_chars = [text[start + maxlen] for start in window_starts]
print('num sequences:', len(sentences))


num sequences: 200285

In [20]:
# Inspect the first training window: its length (equals maxlen) and its text.
len(sentences[0]), sentences[0]


Out[20]:
(40, 'preface\n\n\nsupposing that truth is a woma')

In [21]:
# The target character that immediately follows the first window.
next_chars[0]


Out[21]:
'n'

In [28]:
print('Vectorization...')
# One-hot encode the data set.
# `np.bool` was only a deprecated alias of the builtin and has been removed from
# recent NumPy releases; the builtin `bool` is accepted by every NumPy version.
# The input is a string of length maxlen, so X needs a maxlen axis:
# X has shape (number of sequences, maxlen, vocabulary size).
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
# The output is a single character, so y has no maxlen axis:
# y has shape (number of sequences, vocabulary size).
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1  # set only the slot of this character to True
    y[i, char_indices[next_chars[i]]] = 1


Vectorization...

In [30]:
# Sanity-check the tensor shapes: X is (sequences, maxlen, vocab), y is (sequences, vocab).
print(X.shape, y.shape)


(200285, 40, 57) (200285, 57)

In [31]:
# One-hot vector of the first character of the first window.
print(X[0, 0])


[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False  True False False False False False
 False False False False False False False False False]

In [32]:
# One-hot vector of the target character for the first window.
print(y[0])


[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False  True False False False False False False False
 False False False False False False False False False]

In [60]:
print('Build model...')

# Layer stack, in order:
# - LSTM: its input is (batch size, input sequence length, input dimension); the
#   batch axis is omitted from input_shape. Changing maxlen does not change the
#   number of parameters, because the weights are shared across time steps.
#   128 is the size of both the internal projection and the layer output.
# - Dense: a fully connected layer on top of the 128-dimensional LSTM output that
#   produces one value per character of the vocabulary.
# - softmax: turns those values into a probability distribution over characters.
model = Sequential([
    LSTM(128, input_shape=(maxlen, len(chars))),
    Dense(len(chars)),
    Activation('softmax'),
])


Build model...

In [61]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
lstm_9 (LSTM)                (None, 128)               95232     
_________________________________________________________________
dense_8 (Dense)              (None, 57)                7353      
_________________________________________________________________
activation_8 (Activation)    (None, 57)                0         
=================================================================
Total params: 102,585
Trainable params: 102,585
Non-trainable params: 0
_________________________________________________________________

In [62]:
# RMSprop optimizer with a learning rate of 0.01.
# NOTE(review): `lr` is the older Keras spelling of this argument; newer releases
# call it `learning_rate` -- confirm against the installed Keras version before renaming.
optimizer = RMSprop(lr=0.01)
# Categorical cross-entropy pairs with the one-hot targets and the softmax output layer.
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [73]:
# Meaning of the shapes: 200285 time-series samples of length 40, where every
# time step is a 57-dimensional (one-hot) vector; y holds one 57-dim target per sample.
print(X.shape, y.shape)


(200285, 40, 57) (200285, 57)

In [75]:
def sample(preds, temperature=1.0):
    """Draw one index at random from a temperature-scaled probability vector.

    The input distribution is reshaped to be proportional to
    ``preds ** (1 / temperature)``: temperatures below 1 sharpen it towards the
    most likely entry, temperatures above 1 flatten it towards uniform, and 1.0
    samples from ``preds`` unchanged.

    Args:
        preds: 1-D array-like of non-negative probabilities (e.g. a softmax output).
        temperature: Sampling temperature. Values <= 0 are treated as the
            zero-temperature limit and return the most likely index directly.

    Returns:
        Index of the sampled entry. This is a random draw, not necessarily the
        index with the highest probability.
    """
    preds = np.asarray(preds).astype('float64')
    if temperature <= 0:
        # Dividing by a non-positive temperature would yield NaN probabilities;
        # the limit temperature -> 0 is a greedy pick of the most likely entry.
        return np.argmax(preds)
    # Zero probabilities legitimately map to -inf (and back to 0 after exp), so
    # silence the divide-by-zero warning np.log would otherwise emit for them.
    with np.errstate(divide='ignore'):
        scaled = np.log(preds) / temperature
    # Shift so the largest value is 0: the result is mathematically unchanged,
    # but exp() can no longer underflow every entry to 0 at small temperatures.
    scaled = scaled - np.max(scaled)
    exp_preds = np.exp(scaled)
    probs = exp_preds / np.sum(exp_preds)
    # A single multinomial trial gives a one-hot vector; argmax recovers the
    # index of the entry that was drawn.
    probas = np.random.multinomial(1, probs, 1)
    return np.argmax(probas)

In [ ]:
for iteration in range(1, 60):
    print()
    print('-' * 50)
    print('Iteration', iteration)

    # Train on the time-series data.
    # NOTE: training is currently disabled, so the text below is generated from
    # whatever weights the model already has. Uncomment to train one epoch per iteration.
    # model.fit(X, y, batch_size=128, epochs=1)

    # Pick a random 40-character window of the training text; the model generates
    # the text that follows it. The same seed is reused for every diversity value.
    start_index = random.randint(0, len(text) - maxlen - 1)

    # `diversity` is the sampling temperature handed to sample(). The loop over
    # these values was missing, which left `diversity` undefined (NameError).
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print()
        print('----- diversity:', diversity)

        generated = ''
        sentence = text[start_index: start_index + maxlen]
        generated += sentence

        print('----- Generating with seed: "' + sentence + '"')
        sys.stdout.write(generated)

        # Generate 400 characters.
        # Every predict() call receives the full 40-character window, and the LSTM
        # is created without stateful=True, so context is carried from one step to
        # the next by the sliding window rather than by retained recurrent state.
        for i in range(400):
            x = np.zeros((1, maxlen, len(chars)))

            # One-hot encode the current window. The window slides forward by one
            # character on each pass, absorbing the character generated last.
            for t, char in enumerate(sentence):
                x[0, t, char_indices[char]] = 1.0

            # Input is (sequence length = 40, feature size = 57); the output is a
            # 57-dimensional distribution over the next character.
            preds = model.predict(x, verbose=0)[0]

            # Sample from the distribution instead of always taking the most
            # likely character, so the output does not collapse into repetition.
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]

            generated += next_char

            # Keep the input at length 40: drop the oldest character and append
            # the new one. This window is the input of the next step.
            sentence = sentence[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()


--------------------------------------------------
Iteration 1

----- diversity: 0.2
----- Generating with seed: "now--it dawns upon men that
they have pr"
now--it dawns upon men that
they have prroaaooooanaaoahoooaaaaoaaaaoaooaaaantaooaaaaaaaoanoooaoaaaooaaoaooooaaanaoaaaaoaohaoaaaoaaaaoaaoaaoaaoaoaoonaahaooaoaaaaaaaooaaoaohacooaaoaohoooonaaaoaoaohooaooaoaaaohooaoooaaaoaorooaaoaaaanaaaaoaahooaaaaaaoaaaaoaoaonaaaaaoaaoaaoaaaoaooaoonnanoaaaaoaoahaalocnoaaaonaannoaanaaaaanaahaaaaooaaolaaohanaaaaaaaoaoaoaonoaoaaoaroaoooaaaoaaahoaaaaaoooaoaooaaonnaaaoaoaoaaahaaaaaaaaaooanaaaanaoaooaaonoaaaarna

----- diversity: 0.5
----- Generating with seed: "now--it dawns upon men that
they have pr"
now--it dawns upon men that
they have prooaarrahllcoaannnoatoaaaotaoooonanaoahonhhtaaoahonoonnnaraaanooonlnaolaotoataacacatoaehanooncaioloaaitoolatatahllranaaaoohahloonanaoloohohancnchnohaaalooaaaahotartalnoahonahrnnnonroaaaooaoooaonoanooaohrdaacodhcthnaococoaoahooroaohlncanotrollanaoaohooaoathnaaornaanaoadlnrhlcaoroacrcohntaaaoahoaanolocoaahhonootaaollanonhaaoactohchcnadnaordnlnattaaclrnrohnrnndhaaloonoatnonwaaaaoloonoononlnaaaaanrahat

----- diversity: 1.0
----- Generating with seed: "now--it dawns upon men that
they have pr"
now--it dawns upon men that
they have prlohrrcwohahc-hderaoyinaanodddtlhohcoancorncrhcooh sohodcrcc aeonaonhdoaarn-olatpryhrhtlnwhhronaalrdcoannhoorhdlonoitantaaahcaanrcolrlraaloatnrononoorunrrdhnnlchoo llearavihonrahoaadoitctcaoowchiucatvawlrtlahhccolahnatan lnorncaoannooaaa thrhoia rhnrcdlrnaaarhronhayctlnaahoaolcholh,rhlahdaolnroccoonoolohtcn nalndarhddnnioolooolroaialcaahartnarloaanlntonaitaocdnhtolawvwohnaianaoocoinlrniaalfnlnnarll

----- diversity: 1.2
----- Generating with seed: "now--it dawns upon men that
they have pr"
now--it dawns upon men that
they have propnawrhcaeoslthnohhncsnoi
ooldnnawd.taeonoaaa ahaortntchorrfhaadanaddt(atoct
nanho ohancirhnlatoroltnoa hcaohnidahtlilnotlollnalfaofaaotnwtblnnnaooholoallho-oosnnthloapdalcthhctclocthpdwtaaticolcnlehntcwto
aaahoaarlocnthcnt-rhctn 
 notondoiiosnhdnshristrtloraacodtcartwlrn onlotlniodooz odhcclsonrnnaontnnaaoah tpo dhilalenotdahohoaoanyhohnoat5ghnlaaoanlntonndatallaclocyonaosnc-landratrnacloah stadd

--------------------------------------------------
Iteration 2

----- diversity: 0.2
----- Generating with seed: "uation
is at the back of all their logic"
uation
is at the back of all their logicaooaaananaaohaaooaaoaaaaaaaaooaaaoahnoaaaanaaaaaoaoaaaaaaaoaoalnnaaaonoaoaaaanaoaaoaaanaaoaanooaaaaanoaaaaaoaooaaaanaanoaaaaaaaaaaaonaaaoaooaaoooaaooononaaaaaaaooaaaoaaaaaaaaaonlanaaacaaaoaardahaoaoaanooaaaaaaorooaaoaaaananaaonaooooaaaroanohnaaaaaaaaranoaaaaaoaaaoaoaanoalanoonoaoarnoaaaaahaoaaaaoaaaaaanaaaaaaoaaoaooanoooaoaaaooaoooroaaaaaaoaooaaaaoaaooaaaooaaaaaaoaaaaaonaaaooaonoaaoaoaooaanaanaano

----- diversity: 0.5
----- Generating with seed: "uation
is at the back of all their logic"
uation
is at the back of all their logicoalnaroahcnaonnaoraraal hoocaaarlonionocohnclhootlhtoctolaanroaccocaaooaoonoltorroaaalaroard oocdotrlorahandlnlrahcaochoaaaaldaoaaloco-oolonaalolharaaaoohaoahaannaaohaolacwn aionantaooooooaaaaoaoaocolannoaaooohohaonaacolctahlo lohaoodllonooahaoohtcnaohhooonohnaonnoalroaaoonttoonhotholooanracnoaahlcacooananrrolohtoorrohoaaaaonrorrolhhnaatoalntloodnohnraorooaooaaraoanraaaohhlooddnolaaarrlnrooraoaoah

----- diversity: 1.0
----- Generating with seed: "uation
is at the back of all their logic"
uation
is at the back of all their logicalohhtarioaharnnlhowhotcwaohrcnlaraocdnwoahiahanaoaihoattnnnawohataoennnronacoorganaolsalrdaonahannaarpananatcoydddlhavhnonhhnalooldtwhaoanooh natnloheoidshadwyahoaco  ootahcnorodhhhaadradlyoc hohnrnnrwocllaohdnoanadrnldhnrhloraolaonlnhwndrhoaaoncthlcrltonartoaalnhoahwanhoaothnrircc thnonsaacnhl cenhddolaraa noaachadooolg ylnlsodslaor dnlnnoccaalcopalcrnahrhranorrotilliasnldllaaa nhraaanhhcatnowlc

----- diversity: 1.2
----- Generating with seed: "uation
is at the back of all their logic"
uation
is at the back of all their logicohaoa iiydeloon atntclonwnlwniriaocatoaroolaccrttannhhnooaohonaritaloacdehlhrfoodlnsebnoohtdronaoahacwnnrorfcnnrdd  haonlnhcnhdtnl tahrotirwrchcdehahcsahrnhtdfnloonphotlla cnrcoaoaorcndhnrhlrth-nhdrnnaa0occehantaatonpodrr
hnchnvlhopoldhaaohptstnonhaoehhcatoaconsoohaoddnodnoanaacaan-narlo,oaiylaahctoyodcaoreaoiniaanaodthsohoncladnoo
nrrlhohs  coailindhtnhdtaatpcnclwotoalrocracanrh caoactal?lcaononn

--------------------------------------------------
Iteration 3

----- diversity: 0.2
----- Generating with seed: "t it is difficult to preach this
moralit"
t it is difficult to preach this
moralitoooaaooooaaaaoaoaaooooooalaaoraonaaanaaanooanaaaanoaoaaaaaoaaaooaaroaaaoaoaaaaoaaannaolaaaaoaoaaoaaaaaaoaaoaaaaaoanoooolaaooaaahaaooanaaoaonoaoalaoaoaaaooaaaaoalaoooaaaooaacraaaooaooaaooaaanaaaaaooaaoaoooaaaaaonoaaaaraaohnnaaahraaaaoaaaaaaaaroaaooaooaraoaoaaaoaooonaaaoaooooaoaooooaonaoaaaalaanaaooaooaoaoahoaaaaaaaaaaooooaaanaaaaaalonoaaaonaoonaaloaaaaaanaaaaoooaoaonaaaanaaoaoaoaaaaaaaoaonanoononao

----- diversity: 0.5
----- Generating with seed: "t it is difficult to preach this
moralit"
t it is difficult to preach this
moralitoaonaaaanotnhnanoohuhhactolaraaahcroacaattoannaonhhaaloodaaatonraloooaraanooooraooalaoataaonhadarooroahononaoancarhhnaoaoarooroanohnccadcathoatocoaarannonaaoactnaoanacaaadnolneaaaolohahoooaooalnnlhanaohooooarnaooalnnnoronaaddanaoharcohaaanwaoaarainotooochdthaoadahloallanacoclhorooooho noohrnrlrnaochnnaonraooaoanaothaonotrhtwacohnaaoaaatoooraaaoonoiaotnoarraoooanaooaadrnhodthaaaaocontoaraonnehronnl

----- diversity: 1.0
----- Generating with seed: "t it is difficult to preach this
moralit"
t it is difficult to preach this
moralitralanlhoy edlaaashtochaoraachahhlnldnarnooocrohwolcioocciaaowcoltiho aachdliloalcwaclooaoalaniodsaoddwioladhhnfaolldtacahohnpatrntrachadahttoaahrdanpoatlaahoolyannasanrdlaeoahdrdtveohrhaoolpehevohrlds nntnadoronoaooclhnoahdrtonlcnacrinlchhohn hlolohtlaropoobaochdc aonhodapwnnasrohinaalhacatnaacl
rrooooncrnolhaelacocaalsacnndehiaeannacconl aoatno adonaooooooonnocncslarnanhatlcoai crhcvrrtahantodioa

----- diversity: 1.2
----- Generating with seed: "t it is difficult to preach this
moralit"
t it is difficult to preach this
moralitiadncctdwrncrrdrnhrolociocaaiaalcadiaarcccavdh-ooihhilnaonactoonnvddaaaoalacwcalrlyhtholllhoadhdnd p1nrannlci?adn ndoaalcla dclnoreonhthoyraodtaononhlfnacanhoo nlodcaahhhoociroaaidoohalwlrtoaolocohatl hwairaortaa c.cayaorridtcrocgntllwlaolaaloldorhaniplooflyonptlhrhclnaonayluttnawaaaacycaidlyohlonahnonaltehrsntccarlhnarhlnawnaonaallralottapaasorcc,thearrsldlswodohlanntonsaglcaaha laplorwéonorcaahr

--------------------------------------------------
Iteration 4

----- diversity: 0.2
----- Generating with seed: "ns of transition, for the sake of lighte"
ns of transition, for the sake of lighteaoaaaalaaoaaoaanoaoaanoaaaaaaaaaooaoaaaoaaaaaoaanaaaaanaoooaoaaooaooaoohoooaaaaaooaaaoaaooaraooaaaoolonoohaaaaaaoonoaaaaooanaaaacaaaooaoaaaaooohnoaaanaaaaoaaaaaaaaaaooaaaaaaoooaaoaaaaoaoaaatonooorahoaoaaoooraaaaaaaaoooaaoaaononoaaaaaaanaaoaaaaaanaaaaooaaaoaanconoconaaaaaooaooaoonnaoaaaoaooooaoanaaoaaaanoaaaaoaaaaanoohoaooaoloaooaaaaaoaoloonnaaaaaaanaooooanaaooaoooaanaaaoaaaaaooaoooaaaaoaacaaaraoaa

----- diversity: 0.5
----- Generating with seed: "ns of transition, for the sake of lighte"
ns of transition, for the sake of lightecatahacaonaoaaathaoaclnaaladahccroacnoaro naonalrnaonononrrnclhaaranallranalaoocnordcdrnonoontacoaooinidloaohohdahrhaaahololarahroraoaaaaaaotaoaaacltlaallhoolctrhooaoinhcrlcnhoaacaonhnodocoronlaoaoahaaaaanoohdaonnhoncadnaoaoolnhlantoarcaaaarlaoalannnhlaoeoaoaoahararonawooonnn ohaadnlnoonanddoaoaoaaohnaalnahoaoohlodaaoaaaanantaaaooorooaaccoaltoatcaloorltorhailalodoacanonaataotlrahoolrnarnalnrhatrro

----- diversity: 1.0
----- Generating with seed: "ns of transition, for the sake of lighte"
ns of transition, for the sake of lighteubcahorattacdaadocronwhtihwdtrdacdsghaahcnaldccahannoatahaehodonondrrchrtupncranrirdnorotnvonl hlconaaartcltrloatnatcnpacohtroannrraayahaahthcdnotdco8nlsaweadnictthacrororwohnoarodlntlanaorhpcnrlododlorovnohcclodaaocloanhaaaaonhturoroh drniawaorhhotaaahhcooantaaronalaawolclrtnhdcwtiarnhooanlrrnthenl tttthtnracacatohtcaadrhaoainhoaanadoarocscaarohrn rannandcnnothhaaonoaolccoa eocnahtchoooalcdnsnhwo

----- diversity: 1.2
----- Generating with seed: "ns of transition, for the sake of lighte"
ns of transition, for the sake of lighteyldhoiddrnlybnnnooa- hlhadtrlols raamohwnahnndnoccnnewraohwcwo hohfeodhlntawrdolnhrahnahd
yoaspacwwcncirolrrnannocrturloocahhr ydahlcairholnca-ni honovtawnatclrhvio-adorrc irrlnrhdaandha -hltcohllaatrccooctonrchrdlcohrtanowtthofhirayarnehlahaninatltortoooblhaaoosrcdddaraorahrrargrrnooctront toaooawioaorodho rnohnnclincaoxyanioiolrccigroooolaotahwa aanaieohchthaaarnwc hrhlaroihhcoc crhaarhnttco rhn

--------------------------------------------------
Iteration 5

----- diversity: 0.2
----- Generating with seed: "rateness,
moderation and the like come, "
rateness,
moderation and the like come, ooaaanoaaaaaaooaaaaoonaooohaaaoaaaraoahaaooaooootaaooooooaoaaooaaaaoaaaoaaaaaoaaaaaaoaooaaananaaaaaaoaaooaoooanaocoaaaaoooaaoahoaaoaoooaoaaaaaaoaaaaaanoonoaoooooaaoaoaoaonoaaaaaaaoroaaaaaoaaaannonoloaoaaaannaoaaahaaaanaoooaoooooooaoaaooanhaoooaanaaoaaaaanaaaaoaooonoahnaaaaaaoaoooaoahaaaoarooaaaaannnnaooooaaaooaoaaaoooaaaanoaahaoaoaooaoaoaoaaaaoahornaonoroaaaaaoahaaaaaaooononoonaaaaaaaoaaonaaoaooaa

----- diversity: 0.5
----- Generating with seed: "rateness,
moderation and the like come, "
rateness,
moderation and the like come, noaoooaatcaraahaiatooarnharnhaaoaaohonodnarooahaalalhoaaaooahnaoooaloooooadaocadnaaoltrohnnaonaratlorncoaoaorlnnoaodaanadnrorrncocrohhchnoahaaoanaahondihochaaohaonorotraa asoarnnltnconanooatoaaoalohlooooccooaanlndrhocnaootohacaalaaancttanaroarnhtanaannaaaoalaaaralarhaonnooadlaacanaaoooaonnonoornatdaaancoaaahonnaaaoooaanoanoaaahonnaaaoaaaarcrroanhnwron oootonohaooarconohdoddnaotoalooloonhdaaaanaaha

----- diversity: 1.0
----- Generating with seed: "rateness,
moderation and the like come, "
rateness,
moderation and the like come, an
dwooahnalataccotnadoonccdslloncaltla olsdranoradcathndoncannraooalddapyhnltaahorohaiaanhorolainrrnnalhacanonnioahrtnhlannoowtoctdarah cocochlcddfnanawaneowalolhdotanhdthaaanraansaoloar dallaowhitlalohaoaa olccaolhcln asoagrl-aacdlnrtinanaiapw hhcdacodnatnraaopcaaoaaraccoa ntaraondattndlaooawnodnpatortwaoontahelnihhaorhchaiaacharooodaroccnnollaetrhloo o ahahrooarntawwrcohonaonnaohnhaao
oatlnoclr

----- diversity: 1.2
----- Generating with seed: "rateness,
moderation and the like come, "
rateness,
moderation and the like come, chnngironchhnahlloawcordaldaanhc.aarcrwtonaotnlcaath
rr arlntpiloalt aloahnclawenaoaahnoratctlsnocnrnooohil-ttwoaaonohohahdcinoohaihghanaoiaocerfdonhhothd raooonwlhc a rrthatvostancoorohnliawoeoahranodoaa-lhf ahotaacow arlroanvaohha-ol l chd
rttirarnhditaahrrtooryor0pcao,nootl  lo ynco odlgtapainaiasarnnic aa oishl owiocdntyiahlatcoalaroonchvenacdootawallddalhanniccphdrntrvhrstaoroaastachocoteelon

--------------------------------------------------
Iteration 6

----- diversity: 0.2
----- Generating with seed: "ount to noble and servile, master and sl"
ount to noble and servile, master and slaanaoanaaaaananaaaoaahoaaaoaoaaoaaaanoohaoahoaonooaooroaaaaaoronoaoooaaanaooaaoooaaaanrhhooaaaahaoonoooaoaalaaaoaaraoahnaocaoaaaoonaooloaoaaoaaaaaaoaanoonaooaaaaoaoaaaaaaoaaoaaaaaaooaaaaaaoaaaaooaahaaaaoaaaoaaooaonooaaaaoaoaacaaaaaaaaoranraaaaoahaaaooaaaoaaaaaaaaaooaaooaooooanooaanoaoaaaaaaaalaaoaaaaaaooooaoooaaooaoanaooaaaoaaaaaoaraaaahaaoaahaoaaaaaooaoaooaaaaaaaaaoaooaaooooaaoooaaoaaanaoaoaoraaa

----- diversity: 0.5
----- Generating with seed: "ount to noble and servile, master and sl"
ount to noble and servile, master and slaoalhanaatoaannlcootalnaoaharhah

In [ ]: