Unpacking the dataset
In [1]:
!unzip -u dataset.zip
print('Success')
In [2]:
!zip -r datasetCSV.zip datasetHabrahabr.csv
Importing packages
In [1]:
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, TimeDistributed, Bidirectional
from keras.layers import LSTM, SpatialDropout1D, Conv1D, GlobalMaxPooling1D, MaxPooling1D, Flatten
from keras.layers.core import Dropout
from keras.callbacks import EarlyStopping
import numpy as np
import pandas as pd
import keras
import codecs
import os
%matplotlib inline
Loading the prepared dataset
In [2]:
data = pd.read_csv('datasetHabrahabr.csv')
data.head()
Out[2]:
Loading the test documents
In [3]:
def get_dataset_from_files():
    #path = 'D:\Разработка\DataScience\Habrahabr'
    path = 'Habrahabr/'
    files = os.listdir(path)
    data_frame = pd.DataFrame()
    for file_name in files:
        file_obj = codecs.open(path + file_name, "r", "utf_8_sig")
        file_temp = file_obj.read()
        url = file_temp[file_temp.find('url:') + 5:file_temp.find('title:')].rstrip()
        title = file_temp[file_temp.find('title:') + 7:file_temp.find('text:')].rstrip()
        text = file_temp[file_temp.find('text:') + 5:file_temp.find('author:')].rstrip()
        author = file_temp[file_temp.find('author:') + 8:].rstrip()
        row = pd.Series([url, title, text, author], index=['Url', 'Title', 'Text', 'Author'])
        data_frame = data_frame.append(row, ignore_index=True)
        file_obj.close()
    return data_frame
In [4]:
#data = get_dataset_from_files()
Counting the number of words in each article
In [3]:
data['CountWords'] = data['Text'].map(lambda x: len(x.split()))
print('Number of articles in the corpus:', len(data))
data.head()
Out[3]:
In [4]:
data.CountWords.plot(kind='bar', figsize=(15, 5), title="Number of words in texts");
Number of articles per author
In [4]:
num_classes = 29
In [5]:
author_count_news = data.Author.value_counts()[:num_classes]
author_count_news.plot(kind='bar', figsize=(15, 5), title="Number of author's articles");
In [6]:
temp_data = pd.DataFrame()
names_author = author_count_news.index.values
for author in names_author:
    temp_data = temp_data.append(data[data.Author == author])
data = temp_data
print('Number of articles after removal:', len(data))
In [20]:
from nltk.corpus import stopwords
stop = stopwords.words('russian')
data['Text'] = data['Text'].apply(lambda x: ' '.join([item for item in x.split() if item not in stop]))
print('Stop words have been deleted')
In [22]:
import pymorphy2
morph = pymorphy2.MorphAnalyzer()
data['TextLem'] = data['Text'].map(lambda x: ' '.join([morph.parse(word)[0].normal_form for word in x.split()]))
print('Lemmatization completed')
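A quick sanity check of the lemmatizer on a single word (the word is chosen arbitrarily for illustration): pymorphy2 returns a list of parse hypotheses, and normal_form of the first one is the lemma.
In [ ]:
# Minimal check of pymorphy2 lemmatization (example word chosen arbitrarily)
print(morph.parse('статьи')[0].normal_form)  # expected lemma: 'статья'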
In [7]:
names = data.Author.value_counts().index.values
lableEnc = LabelEncoder()
lableEnc.fit(names.ravel())
lables = lableEnc.transform(names).reshape((num_classes, 1))
oneHotEnc = OneHotEncoder()
oneHotEnc.fit(lables)
#lableEnc.fit(names_author.ravel())
#lables = lableEnc.transform(names_author).reshape((num_classes, 1))
#oneHotEnc.fit(lables)
# Example encoding
#aaa = lableEnc.transform(['@saul'])
#vvv = oneHotEnc.transform(aaa).toarray()
#print(vvv)
Out[7]:
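To verify the encoders, a small round-trip sketch (the author handle '@saul' is taken from the commented example above; any value from names works): name -> integer label -> one-hot vector -> back to the name.
In [ ]:
# Round-trip check of the label and one-hot encoders (example handle from the commented code above)
label = lableEnc.transform(['@saul'])                         # integer class id
one_hot = oneHotEnc.transform(label.reshape(-1, 1)).toarray() # one-hot row of length num_classes
print(one_hot)
print(lableEnc.inverse_transform([int(np.argmax(one_hot))]))  # back to the author name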
In [8]:
for author in names:
    val = lableEnc.transform([author])[0]
    data.Author.replace(to_replace=author, value=val, inplace=True)
#data = data.drop(['Url', 'Title', 'CountWords'], axis=1)
data.head()
Out[8]:
Saving the dataset
In [28]:
filename = 'datasetHabrahabr.csv'
data.to_csv(filename, index=False, encoding='utf-8')
Shuffling the dataset
In [9]:
data = data.sample(frac=1).reset_index(drop=True)
Text tokenization
In [10]:
def get_texts_to_matrix(texts, max_features=0):
    tokenizer = Tokenizer(split=" ", lower=True)
    if max_features != 0:
        tokenizer = Tokenizer(split=" ", lower=True, num_words=max_features)
    tokenizer.fit_on_texts(texts)
    matrix_tfidf = tokenizer.texts_to_matrix(texts=texts, mode='tfidf')
    print('Number of texts:', matrix_tfidf.shape[0])
    print('Number of tokens:', matrix_tfidf.shape[1])
    return matrix_tfidf
In [11]:
def get_texts_to_sequences(text):
    # Build a single vocabulary (word -> index) for the conversion
    tokenizer = Tokenizer(split=" ", lower=True)
    tokenizer.fit_on_texts(text)
    # Convert every text into a sequence of integers, replacing words with their vocabulary indices
    text_sequences = tokenizer.texts_to_sequences(text)
    print('The vocabulary contains {} words'.format(len(tokenizer.word_index)))
    return text_sequences
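A toy illustration of how the word-level tokenizer behaves (the two sentences are made up): words are indexed roughly by frequency, and each text becomes a list of those indices.
In [ ]:
# Toy example of word-level tokenization (sentences invented for illustration)
toy_texts = ['один два три', 'два три четыре']
toy_tokenizer = Tokenizer(split=" ", lower=True)
toy_tokenizer.fit_on_texts(toy_texts)
print(toy_tokenizer.word_index)                     # word -> index, most frequent words first
print(toy_tokenizer.texts_to_sequences(toy_texts))  # e.g. [[3, 1, 2], [1, 2, 4]]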
In [12]:
def get_texts_to_gramm_sequences(texts, count_gramm=3):
    gramms = {}
    counter_gramm = 0
    result = []
    temp_vector = []
    for text in texts:
        for i in range(len(text) - count_gramm - 1):
            gramm = text[i : i + count_gramm]
            if gramms.get(gramm) is None:
                gramms[gramm] = counter_gramm
                counter_gramm += 1
            temp_vector.append(gramms[gramm])
        result.append(temp_vector)
        temp_vector = []
    print('Number of character n-grams in the corpus:', len(gramms))
    #count_gramm = [len(x) for x in text_threegramm]
    #num = np.array(count_gramm)
    #num.mean()
    return result
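The same idea can be checked on a single made-up string: each sliding window of characters receives its own integer id, and repeated windows reuse the id already assigned.
In [ ]:
# Toy check of the character n-gram encoding on an arbitrary short string
toy_gramms = get_texts_to_gramm_sequences(['абвгабвг'], count_gramm=3)
print(toy_gramms)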
In [13]:
#X = get_texts_to_matrix(data['Text'])
X = get_texts_to_gramm_sequences(data['Text'], count_gramm=4)
#X = get_texts_to_sequences(data['Text'])
In [14]:
means = [len(x) for x in X]
plt.plot(means)
Out[14]:
In [15]:
def get_X_y_for_traning(X, y, num_words):
    #tokenizer = Tokenizer(num_words=num_words)
    #X = tokenizer.sequences_to_matrix(X_train, mode='binary')
    X = keras.preprocessing.sequence.pad_sequences(X, maxlen=num_words)
    y = keras.utils.to_categorical(y, num_classes)
    print('Shape of X:', X.shape)
    print('Shape of y:', y.shape)
    return X, y
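A tiny sketch of what pad_sequences and to_categorical do to the data (toy sequences and labels, not taken from the corpus):
In [ ]:
# Toy illustration of padding and one-hot label encoding (data invented for illustration)
toy_X = keras.preprocessing.sequence.pad_sequences([[1, 2], [3, 4, 5, 6]], maxlen=3)
print(toy_X)                                  # [[0 1 2], [4 5 6]]: left-padded / left-truncated by default
print(keras.utils.to_categorical([0, 2], 3))  # two one-hot rows of length 3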
In [16]:
# Maximum number of words in the longest text
num_words = 30000
X_full, y_full = get_X_y_for_traning(X, data.Author, num_words)
X_train, X_test, y_train, y_test = train_test_split(X_full, y_full, test_size=0.2, random_state=42)
print('Testing set size:', len(X_test))
print('Training set size:', len(X_train))
In [17]:
def get_lstm_model():
    model = Sequential()
    model.add(Embedding(270000, 300))
    model.add(SpatialDropout1D(0.3))
    model.add(LSTM(100, dropout=0.3, recurrent_dropout=0.3))
    model.add(Dense(num_classes, activation="sigmoid"))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def get_conv_model():
    model = Sequential()
    model.add(Embedding(863374, 200))
    model.add(SpatialDropout1D(0.2))
    model.add(Conv1D(filters=512, kernel_size=3, activation='relu'))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(num_classes, activation="sigmoid"))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def get_conv_conv_model():
    model = Sequential()
    model.add(Embedding(270000, 300))
    model.add(SpatialDropout1D(0.2))
    model.add(Conv1D(filters=512, kernel_size=3, activation='relu'))
    model.add(MaxPooling1D())
    model.add(Conv1D(filters=256, kernel_size=3, activation='relu'))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(num_classes, activation="sigmoid"))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def get_conv_lstm_model():
    model = Sequential()
    #model.add(Dense(102562, activation='relu', input_shape=(8664, 600)))
    model.add(Embedding(270000, 200))
    model.add(SpatialDropout1D(0.3))
    #model.add(TimeDistributed(Conv1D(filters=512, kernel_size=3, activation='relu')))
    #model.add(TimeDistributed(GlobalMaxPooling1D()))
    #model.add(TimeDistributed(Flatten()))
    model.add(Conv1D(filters=512, kernel_size=3, activation='relu'))
    model.add(MaxPooling1D())
    #model.add(Flatten())
    model.add(LSTM(50, dropout=0.3, recurrent_dropout=0.3))
    model.add(Dense(num_classes, activation="sigmoid"))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def get_lstm_conv_model():
    model = Sequential()
    model.add(Embedding(270000, 300))
    model.add(SpatialDropout1D(0.2))
    model.add(LSTM(50, dropout=0.3, recurrent_dropout=0.3, return_sequences=True))
    #model.add(SpatialDropout1D(0.2))
    model.add(Conv1D(filters=512, kernel_size=3, activation='relu'))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(num_classes, activation="sigmoid"))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
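Bidirectional is imported above but not used in any of the builders; a possible bidirectional variant (a sketch reusing the vocabulary size and embedding dimension of get_lstm_model, not a configuration evaluated in this notebook) could look like this:
In [ ]:
# Sketch of a bidirectional LSTM variant (assumed hyperparameters, not evaluated here)
def get_bilstm_model():
    model = Sequential()
    model.add(Embedding(270000, 300))
    model.add(SpatialDropout1D(0.3))
    model.add(Bidirectional(LSTM(100, dropout=0.3, recurrent_dropout=0.3)))
    model.add(Dense(num_classes, activation="sigmoid"))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model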
In [18]:
%%time
#model = get_lstm_model()
model = get_conv_model()
#model = get_conv_conv_model()
#model = get_conv_lstm_model()
model.summary()
BATCH_SIZE = 16
EPOCHS = 10
VERBOSE = 2
history = model.fit(X_train, y_train,
                    batch_size=BATCH_SIZE,
                    epochs=EPOCHS, verbose=VERBOSE,
                    validation_data=(X_test, y_test)
                    #validation_split=0.1,
                    #callbacks=[EarlyStopping(monitor='val_loss')]
                    )
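EarlyStopping is imported and left commented out in the cell above; a possible variant of the same call with it enabled might look like this (the patience value is an assumption, not taken from the original run):
In [ ]:
# Possible variant of the training call with early stopping on validation loss
# (patience chosen arbitrarily for illustration)
history_es = model.fit(X_train, y_train,
                       batch_size=BATCH_SIZE,
                       epochs=EPOCHS, verbose=VERBOSE,
                       validation_data=(X_test, y_test),
                       callbacks=[EarlyStopping(monitor='val_loss', patience=2)])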
In [23]:
print('Model accuracy: {}'.format(model.evaluate(X_test, y_test, batch_size=32, verbose=2)[1] * 100))
In [153]:
from matplotlib import pyplot as plt
print(history.history.keys())
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'valid'], loc='upper left')
plt.show();
# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'valid'], loc='upper left')
plt.show();
In [99]:
def save_model(model_name):
    # Generate a JSON description of the model architecture
    model_json = model.to_json()
    # Write the model architecture to a file
    json_file = open("model/{}_model.json".format(model_name), "w")
    json_file.write(model_json)
    json_file.close()
    model.save_weights("model/{}_weights.h5".format(model_name))
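A matching loader for the files produced by save_model (a sketch; it assumes the same "model/" directory and file naming as above):
In [ ]:
# Sketch of the inverse operation: restore a model saved by save_model
# (assumes the same "model/{name}_model.json" / "model/{name}_weights.h5" naming)
from keras.models import model_from_json

def load_model_from_files(model_name):
    with open("model/{}_model.json".format(model_name), "r") as json_file:
        loaded = model_from_json(json_file.read())
    loaded.load_weights("model/{}_weights.h5".format(model_name))
    loaded.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return loaded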
In [100]:
#save_model('habra_86persent')
In [ ]: