In [1]:
import numpy
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from six.moves import cPickle as pickle

# fix random seed for reproducibility
numpy.random.seed(7)


Using TensorFlow backend.

In [2]:
import nltk
nltk.download('punkt')       # punkt tokenizer models (cover Polish)
nltk.download('stopwords')   # stopword lists


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Out[2]:
True

In [3]:
with open("Atmosfera-Incidents-2017.pickle", 'rb') as f:
    incidents = pickle.load(f)
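
A quick sanity check on the unpickled data can go here; the cell below assumes incidents is a 2-D NumPy object array with the column names in row 0, which the fancy indexing and header print in the next cell are consistent with.

In [ ]:
# sanity check (assumes a 2-D NumPy object array with a header row)
print(type(incidents))
print(incidents.shape)
print(incidents[0])  # column names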

In [4]:
# pick only the summary and description_plain_text columns
print(incidents[0, [1, 7]])
texts = incidents[1:, [1, 7]]
# flatten to one list of strings, dropping empty cells
txt = [i for l in texts for i in l if i is not None]
# join into one text, lowercase, and turn '/' and '_' into spaces
txt = " ".join(txt).lower().replace('/', ' ').replace('_', ' ')


[u'summary' u'description_plain_text']
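
The chained replace() calls above only handle '/' and '_'; a single regex pass generalizes the same cleanup. A minimal sketch (the clean_text helper is illustrative, not part of the original notebook):

In [ ]:
import re

# illustrative helper: join non-empty fragments, lowercase, and collapse
# '/', '_' and whitespace runs into single spaces
def clean_text(parts):
    joined = " ".join(p for p in parts if p is not None).lower()
    return re.sub(r'[/_\s]+', ' ', joined)

# equivalent to the join/replace chain above, e.g.:
# txt = clean_text(texts.ravel())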

In [5]:
# tokenize with NLTK's Polish punkt model
txt = nltk.word_tokenize(txt, language="polish")

In [6]:
# NLTK's stopwords corpus does not ship a Polish list, so use many_stop_words
#stopset = set(nltk.corpus.stopwords.words('polish'))

import many_stop_words
stopset = many_stop_words.get_stop_words('pl')

# remove Polish stopwords
txt = [w for w in txt if w not in stopset]
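
A quick check that the 'pl' stopword list loaded as expected:

In [ ]:
# how many Polish stopwords were loaded, plus a small sample
print(len(stopset))
print(sorted(stopset)[:10])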

In [7]:
# obtain the frequency distribution of the tokens
fd = nltk.FreqDist(txt)

In [8]:
# remove rubbish one- and two-letter tokens that appear among the most frequent words
morestops = set(i[0] for i in fd.most_common(500) if len(i[0]) < 3)
txt = [w for w in txt if w not in morestops]
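
Note that morestops is built only from fd.most_common(500), so rare one- and two-letter fragments survive the filter. If a stricter cut is wanted, a plain length filter works; a minimal sketch (txt_strict is an illustrative name):

In [ ]:
# optional, stricter variant: drop every token shorter than 3 characters,
# regardless of frequency
txt_strict = [w for w in txt if len(w) >= 3]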

In [9]:
# recompute the frequency distribution on the filtered tokens
fd = nltk.FreqDist(txt)

In [ ]:
# inspect the distribution, e.g. fd.most_common(50) or fd.freq('data')
fd

In [10]:
# pickle the frequency dictionary to a file for later use
with open("FreqDict.pickle", 'wb') as f:
    pickle.dump(fd, f, pickle.HIGHEST_PROTOCOL)
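
The pickled distribution can be read back in a later session the same way the incidents file was loaded above (fd_loaded is an illustrative name):

In [ ]:
# reload the frequency distribution in a later session
with open("FreqDict.pickle", 'rb') as f:
    fd_loaded = pickle.load(f)
print(fd_loaded.most_common(10))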
