In [1]:
from six.moves import cPickle as pickle
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
import numpy as np

# Requires patch from my git
# pip install --upgrade git+https://github.com/emsi/keras
# https://github.com/fchollet/keras/pull/5747
# @@ -14,7 +14,7 @@
#  import warnings
# 
#  if sys.version_info < (3,):
# -    maketrans = string.maketrans
# +    maketrans = lambda intab, outtab: string.maketrans(intab, outtab).decode('latin-1')
#  else:
#      maketrans = str.maketrans


Using TensorFlow backend.

In [2]:
# Load the raw incident table (a pickled numpy array with a header row).
# NOTE(review): pickle.load can execute arbitrary code — only open trusted files.
with open("Atmosfera-Incidents-2017.pickle", 'rb') as pickle_file:
    incidents = pickle.load(pickle_file)

In [3]:
# Keep only the summary (column 1) and description_plain_text (column 7).
# Row 0 is the header row: print it to confirm we picked the right columns,
# then slice it off so `texts` holds data rows only.
# print() function form works on both Python 2 and 3 for a single argument,
# and matches the call style used later in this notebook.
print(incidents[0, [1, 7]])
texts = incidents[1:, [1, 7]]


[u'summary' u'description_plain_text']

In [4]:
# Flatten the (rows x 2) text array into one flat list of strings,
# skipping missing fields. `is not None` is the idiomatic identity test
# (PEP 8) — `!= None` invokes __eq__ and can misbehave on numpy objects.
txt = [cell for row in texts for cell in row if cell is not None]

In [5]:
# Tokenize
# Fit the vocabulary over the full corpus: lower=True folds case,
# split=" " tokenizes on spaces. Per the header note, this relies on the
# patched keras maketrans to handle unicode input under Python 2.
tknzr = Tokenizer(lower=True, split=" ")
tknzr.fit_on_texts(txt)
# vocabulary:
#print(tknzr.word_index)

In [6]:
# Build one document per incident: replace missing fields with "",
# join summary and description with ". ", then map each document to a
# sequence of integer word ids using the fitted tokenizer.
# Intermediate stages get their own name (`docs`) instead of reusing
# X_train for three different kinds of value.
docs = [[field if field is not None else "" for field in row] for row in texts]
docs = [". ".join(row) for row in docs]
# Sequences have varying lengths, so asarray yields a 1-D object array
# of shape (n_rows,) — the printed shape below confirms this.
X_train = np.asarray(tknzr.texts_to_sequences(docs))

print(X_train.shape)


(22450,)

In [ ]:
# Sanity check: decode the first id sequence back into words and compare
# it against the original raw text row.
print(texts[0])
print(X_train[0])

# Invert the tokenizer's word->index mapping into index->word for decoding.
word_dict = {idx: word for word, idx in tknzr.word_index.items()}
# Comprehension replaces the manual append loop (which was also
# mis-indented at 8 spaces).
sample = [word_dict[idx] for idx in X_train[0]]
print(' '.join(sample))

In [ ]:
# Persist the tokenized sequences for downstream training notebooks.
with open("X-sequences.pickle", 'wb') as out_file:
    pickle.dump(X_train, out_file, pickle.HIGHEST_PROTOCOL)

In [ ]:
# Convert root_service (column 3) to integer class labels and pickle them.
Y = [int(label) for label in incidents[1:, 3]]
with open("Y.pickle", 'wb') as label_file:
    pickle.dump(Y, label_file, pickle.HIGHEST_PROTOCOL)

In [ ]:


In [ ]:


In [ ]: