In [43]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
Take the data from here: https://inclass.kaggle.com/c/si650winter11
This time it is an in-class competition rather than a full-scale Kaggle one. In text problems Kaggle usually does not hold back and ships gigabytes of data in the samples; this particular University of Michigan contest, however, unpacks comfortably on a home computer.
In [3]:
import codecs
# read with utf_8_sig to strip a possible BOM; adjust the path to your training.txt file
fileObj = codecs.open('data/TextWorks/training.txt', 'r', 'utf_8_sig')
lines = fileObj.readlines()
fileObj.close()
data = [x.strip().split('\t') for x in lines]
df = pd.DataFrame(data=data, columns=['target', 'text'])
df.target = df.target.astype(np.int32)
df = df.drop_duplicates().reset_index(drop=True)
In [4]:
# the same for the test set; adjust the path to your testdata.txt file
fileObj = codecs.open('data/TextWorks/testdata.txt', 'r', 'utf_8_sig')
lines = fileObj.readlines()
fileObj.close()
data = [x.strip().split('\t') for x in lines]
df_test = pd.DataFrame(data=data, columns=['text'])
df_test = df_test.drop_duplicates().reset_index(drop=True)
In [5]:
df.head()
Out[5]:
In [7]:
# Download the NLTK data packages through the interactive downloader GUI
#import nltk
#nltk.download()
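Instead of the downloader GUI, the specific resources used below can be fetched directly. A minimal sketch; the package names follow the usual NLTK identifiers and may differ between NLTK versions:
import nltk
for pkg in ['punkt', 'stopwords', 'wordnet', 'averaged_perceptron_tagger']:
    nltk.download(pkg, quiet=True)  # tokenizer models, stop-word lists, WordNet data, POS-tagger model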
In [8]:
from nltk import word_tokenize, wordpunct_tokenize, sent_tokenize
In [9]:
s = df.text[1]
s
Out[9]:
In [10]:
import re
In [11]:
tokens = [x.lower() for x in word_tokenize(s) if re.match(r"[a-zA-Z\d]+", x) is not None]  # keep only tokens that start with a Latin letter or a digit
tokens
Out[11]:
In [12]:
from nltk.corpus import stopwords
In [14]:
[print(x, end='\t') for x in stopwords.words('english')];
In [16]:
[print(x, end='\t') for x in stopwords.words('russian')];
In [17]:
throwed = [x for x in tokens if x in stopwords.words('english')]
throwed
Out[17]:
In [18]:
filtered_tokens = [x for x in tokens if x not in stopwords.words('english')]
filtered_tokens
Out[18]:
In [19]:
from nltk import WordNetLemmatizer
In [20]:
wnl = WordNetLemmatizer()
In [21]:
lemmatized = [wnl.lemmatize(x, pos='v') for x in filtered_tokens]  # pos='v': lemmatize every token as a verb
for a, b in zip(filtered_tokens, lemmatized):
    print(a.rjust(10), '->', b.ljust(10), '' if a == b else '<- processed!')
In [22]:
lemmatized = [wnl.lemmatize(x) for x in filtered_tokens]  # no pos given: the default is noun
for a, b in zip(filtered_tokens, lemmatized):
    print(a.rjust(10), '->', b.ljust(10), '' if a == b else '<- processed!')
In [23]:
# Now tag the tokens with their parts of speech
from nltk import pos_tag
In [24]:
pos_tag(filtered_tokens)
Out[24]:
In [25]:
from nltk.help import upenn_tagset
In [26]:
# Descriptions of the tag abbreviations
#upenn_tagset()
In [27]:
from nltk.corpus import wordnet as wn
In [28]:
convert_tag = lambda t: { 'N': wn.NOUN, 'V': wn.VERB, 'R': wn.ADV, 'J': wn.ADJ, 'S': wn.ADJ_SAT }.get(t[:1], wn.NOUN)
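A quick check of the mapping: verb and adjective tags go to the corresponding WordNet constants, and any unknown tag falls back to the noun default:
convert_tag('VBD'), convert_tag('JJ'), convert_tag('XYZ')  # ('v', 'a', 'n')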
In [29]:
lemmatized = [wnl.lemmatize(word, convert_tag(tag)) for word, tag in pos_tag(filtered_tokens)]
for a, b in zip(filtered_tokens, lemmatized):
    print(a.rjust(10), '->', b.ljust(10), '' if a == b else '<- processed!')
In [30]:
from nltk.stem import SnowballStemmer, LancasterStemmer, PorterStemmer
In [31]:
sbs = SnowballStemmer('english')
stemmed = [sbs.stem(x) for x in filtered_tokens]
for a, b in zip(filtered_tokens, stemmed):
    print(a.rjust(10), '->', b.ljust(10), '' if a == b else '<- processed!')
In [32]:
sbs = PorterStemmer()
stemmed = [sbs.stem(x) for x in filtered_tokens]
for a, b in zip(filtered_tokens, stemmed):
    print(a.rjust(10), '->', b.ljust(10), '' if a == b else '<- processed!')
In [33]:
sbs = LancasterStemmer()
stemmed = [sbs.stem(x) for x in filtered_tokens]
for a, b in zip(filtered_tokens, stemmed):
    print(a.rjust(10), '->', b.ljust(10), '' if a == b else '<- processed!')
In [34]:
sbs = SnowballStemmer('english')
stop_words = set(stopwords.words('english'))  # build the stop-word set once; per-token list lookups are slow

def process_by_far(s):
    s = [x.lower() for x in word_tokenize(s) if re.match(r"[a-zA-Z\d]+", x) is not None]  # tokenization
    s = [x for x in s if x not in stop_words]  # stop words
    s = [sbs.stem(x) for x in s]  # stemming
    return ' '.join(s)
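As a quick sanity check, running the whole pipeline on the sample sentence from above should return it tokenized, lower-cased, stop-word-free and stemmed:
process_by_far(df.text[1])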
In [35]:
df['cleansed_text'] = df.text.apply(process_by_far)
In [36]:
df.head()
Out[36]:
In [37]:
from toolz.itertoolz import concat
In [38]:
all_tokens = list(concat(df.cleansed_text.str.split()))
In [39]:
from nltk.probability import FreqDist
In [40]:
fd = FreqDist(all_tokens)
In [41]:
fd.most_common(10)
Out[41]:
In [44]:
plt.figure(figsize=(22, 10));
fd.plot(100, cumulative=False)
In [45]:
fd.hapaxes()[:3]  # hapaxes: all the words that occur only once in the corpus
Out[45]:
In [46]:
len(fd.keys()), len(fd.hapaxes())
Out[46]:
In [49]:
# drop rarely occurring words; compute the set of hapaxes once instead of calling fd.hapaxes() for every row
hapaxes = set(fd.hapaxes())
df['frequent_cleansed'] = df.cleansed_text.str.split()\
                            .apply(lambda ss: ' '.join([x for x in ss if x not in hapaxes]))
In [50]:
df.head()
Out[50]:
In [51]:
from sklearn.feature_extraction.text import CountVectorizer
In [58]:
cv = CountVectorizer()
X_bow = cv.fit_transform(df.frequent_cleansed).todense();
y = df.target
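Before fitting anything it is worth glancing at the bag-of-words matrix we just built:
print(X_bow.shape)                  # (number of documents, vocabulary size)
print(cv.get_feature_names()[:10])  # first few terms of the learned vocabulary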
In [68]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(X_bow, y);
In [69]:
df.target[-4:]
Out[69]:
In [70]:
df.frequent_cleansed[-4:]
Out[70]:
In [71]:
df.frequent_cleansed[3:4]
Out[71]:
In [72]:
text = cv.transform(df.frequent_cleansed[3:4])
In [73]:
import eli5
In [74]:
eli5.explain_prediction_sklearn(clf, text, feature_names=cv.get_feature_names())  # get_feature_names() preserves column order; vocabulary_.keys() does not
Out[74]:
In [75]:
from sklearn.feature_extraction.text import TfidfVectorizer
In [76]:
tfidf = TfidfVectorizer(min_df=1)
In [77]:
X_tfidf = tfidf.fit_transform(df.frequent_cleansed).todense()
In [78]:
idf = tfidf.idf_
terms_score = list(zip(tfidf.get_feature_names(), idf))
sorted(terms_score, key=lambda x: -x[1])[:20]
Out[78]:
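The idf values above can be reproduced by hand: with the default smooth_idf=True, sklearn computes idf(t) = ln((1 + n) / (1 + df(t))) + 1, where n is the number of documents and df(t) is the number of documents containing t. A small verification sketch (cv_check is an ad-hoc helper name):
n_docs = X_tfidf.shape[0]
cv_check = CountVectorizer(vocabulary=tfidf.vocabulary_)                      # same vocabulary, same column order
doc_freq = (cv_check.fit_transform(df.frequent_cleansed) > 0).sum(axis=0).A1  # document frequency of each term
manual_idf = np.log((1 + n_docs) / (1 + doc_freq)) + 1
np.allclose(manual_idf, tfidf.idf_)                                           # expected to be True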
In [82]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
scores = cross_val_score(clf, X_bow, y)
print(np.mean(scores), '+/-', 2 * np.std(scores))
In [83]:
from sklearn.ensemble import RandomForestClassifier
scores = cross_val_score(RandomForestClassifier(n_estimators=100), X_bow, y)
print(np.mean(scores), '+/-', 2 * np.std(scores))
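For comparison, the same cross-validation can be run on the tf-idf features computed earlier; a sketch reusing X_tfidf and the same logistic regression:
scores = cross_val_score(LogisticRegression(), X_tfidf, y)
print(np.mean(scores), '+/-', 2 * np.std(scores))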
In [84]:
# load the libraries and set the options
from __future__ import division, print_function
# silence the warnings
import warnings
warnings.filterwarnings('ignore')
#%matplotlib inline
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
In [85]:
# load the train and test sets
train_df = pd.read_csv('data/TextWorks/train_sessions.csv')#, index_col='session_id')
test_df = pd.read_csv('data/TextWorks/test_sessions.csv')#, index_col='session_id')
# convert the time1, ..., time10 columns to datetime
times = ['time%s' % i for i in range(1, 11)]
train_df[times] = train_df[times].apply(pd.to_datetime)
test_df[times] = test_df[times].apply(pd.to_datetime)
# sort the data by time
train_df = train_df.sort_values(by='time1')
# look at the first rows of the training set
train_df.head()
Out[85]:
In [86]:
sites = ['site%s' % i for i in range(1, 11)]
# replace NaN with 0
train_df[sites] = train_df[sites].fillna(0).astype('int').astype('str')
test_df[sites] = test_df[sites].fillna(0).astype('int').astype('str')
# build the "texts" needed to train word2vec
train_df['list'] = train_df['site1']
test_df['list'] = test_df['site1']
for s in sites[1:]:
    train_df['list'] = train_df['list'] + "," + train_df[s]
    test_df['list'] = test_df['list'] + "," + test_df[s]
train_df['list_w'] = train_df['list'].apply(lambda x: x.split(','))
test_df['list_w'] = test_df['list'].apply(lambda x: x.split(','))
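The same list-of-sites "sentences" can also be built in a single vectorized step; a sketch equivalent to the loop above:
train_df['list_w'] = train_df[sites].values.tolist()
test_df['list_w'] = test_df[sites].values.tolist()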
In [87]:
# In our case a "sentence" is the sequence of sites a user visited.
# There is no need to map the IDs back to site names: the algorithm only models how they co-occur with one another.
train_df['list_w'][10]
Out[87]:
In [89]:
# import word2vec from gensim
from gensim.models import word2vec
In [90]:
# concatenate the train and test sets and train the model on all the data:
# window=3 gives a context of 6 = 3*2 sites (a "sentence" is 10 sites long),
# the output vectors have 300 dimensions, and workers sets the number of CPU cores
test_df['target'] = -1
data = pd.concat([train_df, test_df], axis=0)
model = word2vec.Word2Vec(data['list_w'], size=300, window=3, workers=4)
# build a dictionary mapping each site ID to its vector
w2v = dict(zip(model.wv.index2word, model.wv.syn0))
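A quick sanity check of the embeddings: the nearest neighbours of the most frequent site ID should be sites that often appear in the same sessions (note the most frequent "site" may well be the 0 used to fill missing values):
top_site = model.wv.index2word[0]          # the vocabulary is sorted by descending frequency in this gensim version
model.wv.most_similar(top_site, topn=5)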
In [91]:
class mean_vectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        # use the passed-in dictionary (not the global w2v) to get the embedding dimensionality
        self.dim = len(next(iter(self.word2vec.values())))

    def fit(self, X):
        return self

    def transform(self, X):
        # average the vectors of all in-vocabulary words of each "sentence";
        # fall back to a zero vector when none of the words is known
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])
In [92]:
data_mean = mean_vectorizer(w2v).fit(train_df['list_w']).transform(train_df['list_w'])
data_mean.shape
Out[92]:
In [93]:
# use a simple holdout split for validation
def split(train, y, ratio):
    idx = round(train.shape[0] * ratio)
    return train[:idx, :], train[idx:, :], y[:idx], y[idx:]

y = train_df['target']
Xtr, Xval, ytr, yval = split(data_mean, y, 0.8)
Xtr.shape, Xval.shape, ytr.mean(), yval.mean()
Out[93]:
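Before moving to a neural network, a plain logistic regression on the averaged vectors gives a useful baseline; a sketch reusing LogisticRegression and roc_auc_score imported earlier in the notebook:
lr = LogisticRegression()
lr.fit(Xtr, ytr)
roc_auc_score(yval, lr.predict_proba(Xval)[:, 1])  # probability of the positive class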
In [96]:
# import the Keras building blocks
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Input
from keras.preprocessing.text import Tokenizer
from keras import regularizers
In [97]:
# define the neural network
model = Sequential()
model.add(Dense(128, input_dim=Xtr.shape[1]))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['binary_accuracy'])
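model.summary() is a convenient check that the layer shapes and parameter counts match the intended architecture:
model.summary()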
In [100]:
history = model.fit(Xtr, ytr,
                    batch_size=128,
                    epochs=10,
                    validation_data=(Xval, yval),
                    class_weight='auto',
                    verbose=0)
In [101]:
classes = model.predict(Xval, batch_size=128)
roc_auc_score(yval, classes)
Out[101]: