In [1]:
import numpy
# NOTE(review): the keras/imdb imports below are not used in the visible
# data-preparation cells — presumably required by later modeling cells;
# left in place rather than removed.
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
# Import Embedding from keras.layers directly: the keras.layers.embeddings
# submodule path was removed in modern Keras, while keras.layers has always
# re-exported Embedding, so this spelling works on both old and new versions.
from keras.layers import Embedding
from keras.preprocessing import sequence
# six.moves.cPickle resolves to the C-accelerated stdlib pickle on Python 3.
from six.moves import cPickle as pickle
# Fix the NumPy random seed for reproducibility.
numpy.random.seed(7)
In [2]:
# Input data: load the incidents table exported earlier as a pickle.
# NOTE(review): pickle.load can execute arbitrary code — only load files
# from a trusted source.
with open("Atmosfera-Incidents-2017.pickle", 'rb') as f:
    # Presumably a 2-D NumPy array (the cells below use `incidents[1:, [1, 7]]`
    # style indexing); row 0 is a header row — TODO confirm schema.
    incidents = pickle.load(f)
In [3]:
# Load the word-frequency dictionary built in an earlier step.
with open("FreqDict.pickle", 'rb') as fh:
    fd = pickle.load(fh)
In [4]:
# Keep only the `common` most frequent words and assign each one a compact
# integer id equal to its frequency rank (1 = most common word).
#
# Why ranks instead of the raw counts the original code kept: the encoding
# cell below uses `fd[w]` as a token id, and raw counts would (a) map every
# pair of words that happen to share a frequency onto the SAME id, losing
# information, and (b) produce a sparse id range as large as the biggest
# count. Dense rank ids 1..common avoid both, and 0 stays reserved for
# out-of-vocabulary words (the OOV fallback used when encoding).
common = 10000
fd = {word: rank
      for rank, (word, _count) in enumerate(fd.most_common(common), start=1)}
In [5]:
# Join the summary (col 1) and description_plain_text (col 7) columns,
# skipping the header row and dropping missing (None) entries.
# Use `is not None` rather than `!= None`: identity comparison is the
# idiomatic (PEP 8) None test and avoids element-wise `__eq__` surprises
# with NumPy-derived objects.
X = [[field for field in row if field is not None] for row in incidents[1:, [1, 7]]]
# Normalize: lowercase, and treat '/' and '_' as word separators so that
# compound tokens split on whitespace later.
X = [" ".join(row).lower().replace('/', ' ').replace('_', ' ') for row in X]
In [6]:
# Translate each document's words to integer ids via the frequency
# dictionary; out-of-vocabulary words map to 0.
# dict.get(w, 0) replaces the original `fd[w] if (w in fd) else 0`,
# which did the same thing with two hash lookups instead of one.
X = [[fd.get(w, 0) for w in doc.split()] for doc in X]
In [7]:
# Save the encoded documents (X) to a pickle file for later reuse.
# (The original Polish comment said "dictionary", but X holds the
# integer-encoded texts, not the frequency dictionary.)
with open("X.pickle", 'wb') as f:
    pickle.dump(X, f, pickle.HIGHEST_PROTOCOL)
In [8]:
# Convert the incident_class column (index 2, header row skipped) to
# integer labels and persist them for the modeling step.
Y = [int(label) for label in incidents[1:, 2]]
with open("Y.pickle", 'wb') as fh:
    pickle.dump(Y, fh, pickle.HIGHEST_PROTOCOL)
In [9]:
Out[9]:
In [10]:
Out[10]:
In [ ]: