In [1]:
import numpy
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding
from keras.preprocessing import sequence
from six.moves import cPickle as pickle

# fix random seed for reproducibility
numpy.random.seed(7)


Using TensorFlow backend.

In [2]:
# Input data
with open("Atmosfera-Incidents-2017.pickle", 'rb') as f:
    incidents = pickle.load(f)

In [3]:
# Word-frequency dictionary
with open("FreqDict.pickle", 'rb') as f:
    fd = pickle.load(f)

In [4]:
# Keep only the 10 000 most common words, mapped to their frequency rank
# (1 = most common). Raw counts would collide and overflow the vocabulary,
# so rank indices are used, with 0 reserved for out-of-vocabulary words.
common = 10000
fd = {w: i + 1 for i, (w, _) in enumerate(fd.most_common(common))}
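To make the mapping concrete, here is the same rank encoding on a toy counter (collections.Counter stands in for whatever object FreqDict.pickle holds; both expose most_common):

from collections import Counter
toy = Counter("the cat sat on the mat the end".split())
toy_map = {w: i + 1 for i, (w, _) in enumerate(toy.most_common(3))}
# -> {'the': 1, 'cat': 2, 'sat': 3} (tie-breaking among count-1 words may vary)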

In [5]:
# Join the summary and description_plain_text columns, dropping the header row
X = [[i for i in l if i is not None] for l in incidents[1:, [1, 7]]]
X = [" ".join(l).lower().replace('/', ' ').replace('_', ' ') for l in X]
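For example, a hypothetical row whose summary is 'VPN/outage' and whose description is 'link_down' would be flattened like this:

row = ['VPN/outage', 'link_down']   # hypothetical summary + description pair
text = " ".join(i for i in row if i is not None)
text.lower().replace('/', ' ').replace('_', ' ')
# -> 'vpn outage link down'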

In [6]:
# Encode each word as its frequency rank (0 for out-of-vocabulary words)
X = [[fd.get(w, 0) for w in l.split()] for l in X]
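A quick illustration of this step with a hypothetical three-word vocabulary:

toy_map = {'the': 1, 'cat': 2, 'sat': 3}   # hypothetical rank mapping
[toy_map.get(w, 0) for w in "the cat sat on the mat".split()]
# -> [1, 2, 3, 0, 1, 0]: out-of-vocabulary words ('on', 'mat') map to 0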

In [7]:
# Pickle the encoded sequences to a file for later use
with open("X.pickle", 'wb') as f:
    pickle.dump(X, f, pickle.HIGHEST_PROTOCOL)

In [8]:
# Convert the incident_class column to ints and save it
Y = [int(i) for i in incidents[1:, 2]]
with open("Y.pickle", 'wb') as f:
    pickle.dump(Y, f, pickle.HIGHEST_PROTOCOL)

In [9]:
# Peek at the raw summary and description of the first incident
incidents[1, [1, 7]]
Out[9]:
array([u'[TV] Timeouty dla us\u0142ugi VasValidator',
       u'Dobry wiecz\xf3r, od godziny 02:13 obserwujemy du\u017c\u0105 ilo\u015b\u0107 timeout\xf3w dla us\u0142ugi_x000D_\nVasValidator. Prosz\u0119 o weryfikacje._x000D_\n--_x000D_\nINSYS Support_x000D_\n'], dtype=object)

In [10]:



Out[10]:
2814L

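The so-far-unused imports at the top of the notebook (Sequential, Embedding, LSTM, sequence) point at the intended next step: pad the encoded sequences to a fixed length and train an embedding + LSTM classifier on them. A minimal sketch of that step, assuming a binary incident_class; the layer sizes and sequence length are illustrative, not taken from this notebook:

max_len = 500                                    # illustrative cap on sequence length
X_pad = sequence.pad_sequences(X, maxlen=max_len)

model = Sequential()
model.add(Embedding(common + 1, 32, input_length=max_len))  # +1: index 0 is the OOV bucket
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))        # assumes binary incident_class labels
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_pad, numpy.array(Y), epochs=3, batch_size=64)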