In [286]:
import os
import io
import numpy as np
import random
import re
In [287]:
def unique(arr, dic=None):
    if dic is None:
        dic = {}
    for el in arr:
        if isinstance(el, list):
            unique(el, dic)
        else:
            if el not in dic:
                dic[el] = 1
            else:
                dic[el] += 1
    return np.array(dic.keys())
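As a quick check (the call below is illustrative and not part of the original experiment), `unique` flattens nested lists and returns the distinct values it saw:
In [ ]:
# illustration only: unique() flattens nested lists and returns the distinct values
print(unique([[0, 1], [2, 1], [2, 3]]))   # e.g. [0 1 2 3] (order depends on the dict)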
Classification is performed according to the following formula: $$p(c\mid d,\lambda)=\frac {\exp\sum_i^{n \times k}{\lambda_i}f_i\left(c,d\right )} {\sum_{\tilde{c}\in C}{\exp\sum_i^{n \times k}{\lambda_i}f_i\left(\tilde{c},d\right )}}$$
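To make the formula concrete, here is a minimal NumPy sketch for a hypothetical document that activates two features over three classes; the weights `lam` are made-up numbers, not output of the model:
In [ ]:
# illustration only: p(c | d, lambda) for a toy document with two active features
lam = np.array([[0.5, -0.2, 0.1],    # weights of feature 0 for classes 0, 1, 2
                [0.3,  0.4, -0.1]])  # weights of feature 1 for classes 0, 1, 2
scores = lam.sum(axis=0)             # sum_i lambda_i * f_i(c, d), with f_i in {0, 1}
p = np.exp(scores) / np.sum(np.exp(scores))
print(p)                             # a proper distribution over the three classes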
In [288]:
def predict(x, weights, feature_patterns, n_classes):
    # start from the log of a uniform prior
    probas = np.ones(n_classes) * np.log(1.0 / n_classes)
    # add up the weights of the active features per class
    for xi in x:
        for i in xrange(len(feature_patterns[xi])):
            probas[feature_patterns[xi][i]] += weights[xi][i]
    # then squash the outputs through a softmax
    probas = np.exp(probas / n_classes)
    return probas / np.sum(probas)
We will solve the problem by maximizing the log-likelihood $$\log p(C|D,\lambda) =\sum_{(c,d)\in(C,D)}\log p(c|d,\lambda) =\sum_{(c,d)\in(C,D)}\log\frac {\exp\sum_i^{n \times k}{\lambda_i}f_i\left(c,d\right )} {\sum_{\tilde{c}\in C}{\exp\sum_i^{n \times k}{\lambda_i}f_i\left(\tilde{c},d\right )}}$$
Accordingly, the gradient is given by the partial derivatives
$$\frac{\partial\log p(C|D,\lambda)}{\partial\lambda_i}= \sum_{(c,d)\in(C,D)}{f_i(c,d)}- \sum_{d\in D}{\sum_{c\in C}{p(c|d,\lambda)f_i(c,d)}}$$so that, with L1 regularization, the weight update becomes: $$w^{k+1} = w^{k} + \eta_k\frac{\partial}{\partial w_i}\left(L(j,w) - \frac{C}{N}|w_i|\right)$$
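To tie the gradient to the code below: for a single training pair, the derivative with respect to the weight of an active feature is just the empirical indicator minus the model expectation, which is what the inner loop of `fit` computes via `expected_ent` and `max_ent`. A hand-written sketch of one such update with illustrative numbers:
In [ ]:
# illustration only: one SGD update for a single training pair
w = np.zeros(2)                  # weights of one active feature, one entry per class
probas = np.array([0.7, 0.3])    # current model estimate p(c | d, lambda)
true_class, eta = 1, 0.1
for c in range(2):
    observed = 1.0 if c == true_class else 0.0   # empirical indicator f_i(c, d)
    expected = probas[c]                         # model expectation of the same indicator
    w[c] += eta * (observed - expected)          # gradient ascent on the log-likelihood
print(w)                         # [-0.07  0.07]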
In [289]:
# the C/N * |w_i| penalty term of the update above, applied cumulatively and clipped at zero
def apply_l1_penalty(i, j, u, weights, q):
    z = weights[i][j]
    if weights[i][j] > 0:
        weights[i][j] = max(0.0, weights[i][j] - (u + q[i][j]))
    elif weights[i][j] < 0:
        weights[i][j] = min(0.0, weights[i][j] + (u - q[i][j]))
    q[i][j] = weights[i][j] - z
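This is the clip-at-zero cumulative L1 penalty (in the spirit of Tsuruoka et al., 2009): `u` is the total penalty accumulated so far, and `q` remembers how much of it has already been applied to a given weight. A small standalone sketch with made-up numbers shows how it drives a small positive weight to exactly zero and keeps it there:
In [ ]:
# illustration only: the cumulative penalty drives a small positive weight to zero
weights = [[0.05]]
q = [[0.0]]
u = 0.0
for step in range(3):
    u += 0.03                            # the accumulated L1 penalty grows every step
    apply_l1_penalty(0, 0, u, weights, q)
    print(weights[0][0])                 # ~0.02, then 0.0, and it stays at 0.0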
In [290]:
def fit(X, y, f_count, c_count, batch_size=64, nb_epoch=10, alpha=0.85, max_iter=100, tol=0.00001, l1param=0.1, random_state=None, verbose=1):
    '''
    X - contexts
    y - targets
    f_count - number of unique features in the contexts
    c_count - number of unique targets, i.e. the classes we predict
    batch_size - size of the subsample drawn on each iteration
    alpha - learning rate (decay base for eta)
    max_iter - maximum number of iterations per epoch
    tol - gradient shift threshold (currently unused)
    l1param - L1 regularization parameter
    random_state - seed for reproducing the experiment
    verbose - debug output
    '''
    n_samples = len(X)
    if batch_size is None:
        batch_size = n_samples
    if batch_size > n_samples:
        batch_size = n_samples
    if random_state is not None:
        random.seed(random_state)
    # # count the unique tokens
    # features = unique(X)
    # f_count = features.shape[0]
    # # count the unique classes
    # classes = unique(y)
    # c_count = classes.shape[0]
    # indicator matrix (conditional features)
    feature_patterns = [[] for _ in range(f_count)]
    f_pattern_set = {}
    # weight matrix of the indicators
    weights = [[] for _ in range(f_count)]
    q = [[] for _ in range(f_count)]
    # initialize the indicators
    for i in range(n_samples):
        for xi in X[i]:
            if xi not in f_pattern_set:
                f_pattern_set[xi] = set()
            if y[i] not in f_pattern_set[xi]:
                f_pattern_set[xi].add(y[i])
                weights[xi].append(0.0)
                q[xi].append(0.0)
                feature_patterns[xi].append(y[i])
    prev_logl = 0.
    iter_num = 0
    all_iter = 0
    u = 0.0
    for epoch in range(nb_epoch):
        if verbose:
            print 'Start epoch #%d\t' % epoch,
        # SGD, capped at max_iter iterations per epoch
        loss = 0.
        for iter_num in range(max_iter):
            if verbose and (iter_num % (max_iter / 20) == 0):
                print '.',
            logl = 0.
            ncorrect = 0
            r = range(n_samples)
            r = random.sample(r, batch_size)
            iter_sample = 0
            for i in r:
                iter_sample += 1
                all_iter += 1
                eta = alpha ** (all_iter / n_samples)
                # predict the class probabilities
                probas = predict(X[i], weights, feature_patterns, c_count)
                # check whether the prediction is correct (needed only for verbose output)
                if np.argmax(probas) == y[i]:
                    ncorrect += 1
                # accumulate the log-likelihood
                logl += np.log(probas[y[i]])
                # accumulate the total L1 penalty
                u += eta * l1param
                # update the weights
                for j in range(len(X[i])):
                    conditional_y = feature_patterns[X[i][j]]
                    for y_i in xrange(len(conditional_y)):
                        # empirical expectation (observed indicator)
                        expected_ent = 1.0 if conditional_y[y_i] == y[i] else 0.0
                        # model expectation
                        max_ent = probas[conditional_y[y_i]]
                        weights[X[i][j]][y_i] -= 1.0 * (max_ent - expected_ent) * eta
                        apply_l1_penalty(X[i][j], y_i, u, weights, q)
            loss += (logl - prev_logl)
            prev_logl = logl
        if verbose:
            print '\tLoss: %.8f' % (loss / max_iter)
    return weights, feature_patterns
In [291]:
# a small test
X = [[0, 1],
     [2, 1],
     [2, 3],
     [2, 1],
     [0, 1],
     [2, 1, 4],
     [2, 3, 4],
     [2, 1, 5],
     [0, 3, 5],
     [0, 1, 5]]
y = [0, 0, 1, 1, 0, 1, 1, 0, 0, 0]
# count the unique tokens
features = unique(X)
f_count = features.shape[0]
# count the unique classes
classes = unique(y)
c_count = classes.shape[0]
weights, patterns = fit(X, y, f_count, c_count, random_state=241, l1param=0.00001, nb_epoch=1)
# print weights
# print patterns
pred = predict([0, 1], weights, patterns, c_count)
print pred
In [292]:
digits_regex = re.compile(u'\d')                          # any digit
punc_regex = re.compile(u'[\%\(\)\-\/\:\;\<\>\«\»\,]')    # punctuation to strip
delim_regex = re.compile(u'([\.])\s+')                    # sentence-final dot
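As a quick illustration (the sample line is made up), this is what the three regexes do to a line of text before it is split into tokens, mirroring the normalization applied by the tokenizer below:
In [ ]:
# illustration only: normalize a made-up line the same way the tokenizer below does
sample = u'В 2015 году (например): ставка выросла на 3%. Конец.'
sample = digits_regex.sub(u'0', sample.lower())   # collapse every digit to 0
sample = punc_regex.sub(u'', sample)              # drop the listed punctuation
sample = delim_regex.sub(r' \1 ', sample)         # detach sentence-final dots
print(u' | '.join(sample.split()))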
In [293]:
def read_and_tokenize(foldername):
    '''
    read the texts from the files in the given folder;
    a fairly simple tokenization is applied here
    '''
    word_counts = {}
    tokenized_text = []
    for path, subdirs, files in os.walk(foldername):
        for name in files:
            filename = os.path.join(path, name)
            with io.open(filename, 'r', encoding='utf-8') as data_file:
                for line in data_file:
                    if len(line) < 50:
                        continue
                    text = digits_regex.sub(u'0', line.lower())
                    text = punc_regex.sub(u'', text)
                    text = delim_regex.sub(r' \1 ', text)
                    for word in text.split():
                        if not word:
                            continue
                        if word not in word_counts:
                            word_counts[word] = 1
                        else:
                            word_counts[word] += 1
                        tokenized_text.append(word)
    # keep only the words seen more than twice and index them
    word2index = {}
    index2word = []
    i = 0
    filtered_text = []
    for word in tokenized_text:
        if word_counts[word] > 2:
            if word not in word2index:
                word2index[word] = i
                index2word.append(word)
                i += 1
            filtered_text.append(word)
    return filtered_text, word2index, index2word
In [294]:
def generate_train(tokenized_text, word2index, context_len=4):
    '''
    generate the training data: for every word, the indices
    of up to context_len preceding words form the context
    '''
    X = []
    y = []
    for i, y_word in enumerate(tokenized_text):
        x = []
        for j in range(i - context_len, i):
            if j >= 0:
                x_word = tokenized_text[j]
                x.append(word2index[x_word])
        if len(x) > 0:
            X.append(x)
            y.append(word2index[y_word])
    return X, y
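For example, on a toy token sequence (illustrative data only), every target word gets a left context of at most `context_len` indices:
In [ ]:
# illustration only: contexts on the left, target indices on the right
toy_text = [u'a', u'b', u'c', u'd']
toy_index = {u'a': 0, u'b': 1, u'c': 2, u'd': 3}
toy_X, toy_y = generate_train(toy_text, toy_index, context_len=2)
print(toy_X)   # [[0], [0, 1], [1, 2]]
print(toy_y)   # [1, 2, 3]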
In [295]:
tokenized_text, word2index, index2word = read_and_tokenize('data')
In [296]:
unique_words = len(index2word)
print 'all words:', len(tokenized_text)
print 'all unique words:', unique_words
In [297]:
context_len = 4
X, y = generate_train(tokenized_text, word2index, context_len=context_len)
In [298]:
weights, patterns = fit(X, y, unique_words, unique_words, random_state=241, verbose=1, batch_size=64, nb_epoch=2, l1param=0.0001, tol=0.000001)
In [299]:
test = [word2index[word] for word in [u'путин']]
last_index = index = test[-1]
print 'GENERATED TEXT:'
for i in range(20):
    pred = predict(test, weights, patterns, unique_words)
    # take the top-20 predictions and pick the first word not already in the context
    indices = pred.argsort()[::-1][:20]
    for index in indices:
        if index in test:
            continue
        else:
            break
    last_index = int(index)
    print index2word[index],
    # slide the context window
    test.append(index)
    if len(test) > context_len:
        del test[0]