In [1]:
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from nltk.stem.snowball import RussianStemmer
import string
from sklearn.feature_extraction.text import CountVectorizer
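In [ ]:
# One-time setup (only needed if the NLTK Russian stop-word corpus is missing
# on this machine; stopwords.words("russian") below fails without it):
# import nltk
# nltk.download('stopwords')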
In [2]:
from scipy.stats import randint
MAX_FEATURES = 5000 # maximum number of word features for CountVectorizer
CAT_COLS = ['category', 'subcategory'] # columns to factorize
TARGET_COLUMNS = ['title', 'description', 'attrs', ['title', 'description']] # columns used to build bag-of-words matrices
SEED = 8451 # random seed
MODEL_COLUMNS = ['price', 'phones_cnt', 'emails_cnt', 'urls_cnt', 'category', 'subcategory']
FOREST_TRAIN_PARAMETERS = {"max_depth": randint(low=1, high=15),
"max_features": ['sqrt', 'log2'],
"min_samples_leaf": [4, 8, 16, 32],
"bootstrap": [True, False],
"criterion": ["gini", "entropy"],
}
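In [ ]:
# Illustrative only: RandomizedSearchCV draws integer max_depth candidates from
# this scipy distribution; .rvs() samples it the same way (seeded for reproducibility).
randint(low=1, high=15).rvs(size=5, random_state=SEED)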
In [3]:
train_data_raw = pd.read_csv('data/avito_train.tsv', sep='\t', index_col=0)
test_data_raw = pd.read_csv('data/avito_test.tsv', sep='\t', index_col=0)
In [4]:
train_data_raw.shape
Out[4]:
In [5]:
train_data_raw.head(1)
Out[5]:
In [7]:
train_data = train_data_raw[:50000].copy()
test_data = test_data_raw[:10000].copy()
# Inspect each column: name, dtype, number of unique values
for column in train_data.columns:
    print("{: <20} {:} {: >10}".format(column, train_data[column].dtype, len(train_data[column].unique())))
# From this table, the categorical string columns are:
cat_cols = ['category', 'subcategory']
# Encode them as integer codes (pd.factorize; sklearn's LabelEncoder would work as well)
train_data[cat_cols] = train_data[cat_cols].apply(lambda x: pd.factorize(x)[0])
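In [ ]:
# Illustrative: pd.factorize maps each distinct label to an integer code;
# this toy call returns codes array([0, 1, 0]) plus the unique labels.
pd.factorize(pd.Series(['Авто', 'Недвижимость', 'Авто']))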
In [5]:
# %load scripts/preprocessing.py
from nltk.stem.snowball import RussianStemmer
from nltk.corpus import stopwords
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from scipy.stats import randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
# Stemmer, punctuation/digit set, and Russian stop words
stemmer = RussianStemmer()
exclude = string.punctuation + string.digits
stopwords = set(stopwords.words("russian"))  # deliberately shadows the nltk module
vectorizer = None
# Convert a string to a list of lowercased, stemmed words
def clear(text):
    # 0. Guard against NaN values in the data
    text = str(text)
    if text == "nan":
        return []
    # 1. Drop everything except Latin and Cyrillic letters
    temp = re.sub("[^a-zA-Zа-яА-Я]", " ", text)
    # 2. Lowercase and split into words
    temp = temp.lower().split()
    # 3. Stem, drop stop words and very short tokens
    temp = [stemmer.stem(i) for i in temp if i not in stopwords]
    temp = [i for i in temp if len(i) > 2]
    return temp
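# Quick sanity check (illustrative; the exact stems depend on the Snowball rules):
#   clear("Продаю новый телефон!")  ->  stemmed tokens such as ['прода', 'нов', 'телефон']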
def preprocessFeatures(df, vectorizers=None, columns=TARGET_COLUMNS, max_features=MAX_FEATURES):
    data_type = -1  # -1: something is wrong; 0: training data; 1: test data
    if vectorizers is None:
        print("PROCESSING TRAIN DATA")
        vectorizers = dict()  # fitted vectorizers (built on the training set only)
        data_type = 0
    else:
        print("PROCESSING TEST DATA")
        data_type = 1
    features = []  # preprocessing results
    for column in columns:
        print("COLUMN: {0}".format(column))
        # 1. Clean the data and join each row back into a single string
        cleared = []  # list of cleaned, transformed strings
        if type(column) is str:  # a single column
            cleared = [" ".join(clear(i)) for i in df[column]]
        else:  # a pair of columns
            temp = [series_.values for id_, series_ in df[column].iterrows()]
            temp = [" ".join(clear(str(i) + str(j))) for i, j in temp]
            cleared = cleared + temp
        print(" - Cleared")
        # 2. For test data the column's vectorizer already exists, so reuse it;
        #    for training data, create and fit a new one.
        if data_type == 0:
            vect = CountVectorizer(analyzer="word",
                                   tokenizer=None,
                                   preprocessor=None,
                                   stop_words=None,
                                   max_features=max_features)
            # 3a. Learn the vocabulary and transform
            features.append(vect.fit_transform(cleared))
            vectorizers["".join(column)] = vect  # remember the fitted vectorizer
        else:
            # 3b. Just transform with the existing vocabulary
            features.append(vectorizers["".join(column)].transform(cleared))
        print(" - Processed\n")
    return features, vectorizers
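# Note: the training pass calls fit_transform (the vocabulary is learned), while
# the test pass reuses the stored vectorizer's transform, so the train and test
# matrices share the same column semantics.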
# Run each fitted model on its corresponding bag-of-words matrix
def modelsPredicts(frames, models):
predictions = []
for i in range(len(models)):
model = models[i]
X = pd.DataFrame(frames[i].toarray())
y = model.predict(X)
predictions.append(y)
return predictions
# Append the remaining raw features and assemble the meta-feature matrix
def concatenateRemaining(df, predictions, model_columns=MODEL_COLUMNS, cat_cols=CAT_COLS):
    # 1. Factorize the categorical columns
    df[cat_cols] = df[cat_cols].apply(lambda x: pd.factorize(x)[0])
    X = pd.DataFrame(predictions).T
    X = X.set_index(df.index)
    for column in model_columns:
        X[column] = df[column]
    return X
# Search for the best parameters of the covering model
# (trained on the remaining features plus the local models' predictions)
def getCoveringModelParams(X, y,
                           parameters=FOREST_TRAIN_PARAMETERS,
                           seed=SEED,
                           model=RandomForestClassifier(random_state=SEED),
                           ):
    grid_search = RandomizedSearchCV(model,
                                     param_distributions=parameters,
                                     n_iter=15, cv=5,
                                     scoring='neg_mean_squared_error',
                                     random_state=SEED,
                                     verbose=1)
    grid_search.fit(X, y)
    return grid_search.best_params_
# Train the local models, the covering model, and the vectorizers
def trainModel(df, cat_cols=CAT_COLS, max_features=MAX_FEATURES, target_columns=TARGET_COLUMNS, seed=SEED):
    # 1. Build the bag-of-words matrices
    sparse_frames, vectorizers = preprocessFeatures(df)
    # 2. Fit a local model (model_1[]) on each matrix
    models = []
    print("FITTING LOCAL MODELS")
    i = 1
    for fr in sparse_frames:
        sgd_clf = SGDClassifier(random_state=seed, n_jobs=-1)
        X = fr  # SGDClassifier accepts sparse input directly
        y = df.is_blocked
        sgd_clf.fit(X, y)
        models.append(sgd_clf)
        print(" - fitted {0} from {1}".format(i, len(sparse_frames)))
        i += 1
    print()
    # 3. Get each local model's predictions on its matrix
    predictions = modelsPredicts(sparse_frames, models)
    # 4. Assemble the meta-feature matrix: predictions plus categorical and extra columns
    X = concatenateRemaining(df, predictions)
    y = df['is_blocked']
    # 5. Fit the covering model (model_2) on the meta-features
    best_params = getCoveringModelParams(X, y)
    covering_model = RandomForestClassifier(**best_params, n_jobs=-1).fit(X, y)
    return vectorizers, models, covering_model
In [11]:
###
### Full training and prediction pipeline
###
from sklearn.linear_model import SGDClassifier
from scipy.stats import randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
# 1. Train the model
vectorizers, models, covering_model = trainModel(train_data)
# 2. Preprocess the test data
pre_data = preprocessFeatures(test_data, vectorizers)[0]
# 3. Get the local models' predictions
priv_predictions = modelsPredicts(pre_data, models)
# 4. Build the meta-feature matrix
X = concatenateRemaining(test_data, priv_predictions)
# 5. Predict with the covering model
test_predictions = covering_model.predict(X)
pasta = X
pasta['is_blocked'] = test_predictions
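In [ ]:
# A sketch, not part of the original pipeline: persist the fitted artifacts so the
# stack can be reused without retraining (the path is an assumption).
# from joblib import dump
# dump((vectorizers, models, covering_model), 'models/avito_stack.joblib')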
In [13]:
X = X.drop('is_blocked', axis=1)
# Note: for a meaningful score, y must hold the true is_blocked labels for X's rows;
# here it is whatever y was left in scope by the earlier cells.
covering_model.score(X, y)
# pasta
In [ ]:
###
### The same pipeline on freshly loaded data
###
from sklearn.linear_model import SGDClassifier
from scipy.stats import randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
train = pd.read_csv('data/avito_train.tsv', sep='\t', index_col=0)
test = pd.read_csv('data/avito_test.tsv', sep='\t', index_col=0)
In [6]:
train = train[:50000].copy()
# 1. Train the model
vectorizers, models, covering_model = trainModel(train)
# 2. Preprocess the test data
pre_data = preprocessFeatures(test, vectorizers)[0]
# 3. Get the local models' predictions
priv_predictions = modelsPredicts(pre_data, models)
# 4. Build the meta-feature matrix
X = concatenateRemaining(test, priv_predictions)
# 5. Predict with the covering model
test_predictions = covering_model.predict(X)
# pasta = X
# pasta['is_blocked'] = test_predictions
In [52]:
###
### The same pipeline, written out step by step
###
# from sklearn.linear_model import SGDClassifier
# from scipy.stats import randint
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import RandomizedSearchCV
# 1. Factorize the categorical columns
train_data[CAT_COLS] = train_data[CAT_COLS].apply(lambda x: pd.factorize(x)[0])
test_data[CAT_COLS] = test_data[CAT_COLS].apply(lambda x: pd.factorize(x)[0])
# 2. Build the bag-of-words matrices
# temp = [clearColumn(train_data, test_data, i) for i in TARGET_COLUMNS]
sparse_frames_train, vectorizers = preprocessFeatures(train_data)
sparse_frames_test = preprocessFeatures(test_data, vectorizers)[0]
# 3. Fit a local model (model_1[]) on each matrix
models = []
for fr in sparse_frames_train:
    sgd_clf = SGDClassifier(random_state=SEED)
    X = pd.DataFrame(fr.toarray())
    y = train_data.is_blocked
    sgd_clf.fit(X, y)
    models.append(sgd_clf)
# 4. Get each local model's predictions on its training matrix
predictions = []
for i in range(len(models)):
    model = models[i]
    X = pd.DataFrame(sparse_frames_train[i].toarray())
    y = model.predict(X)
    predictions.append(y)
# 5. Assemble the meta-feature matrix: predictions plus categorical and extra columns
summary = pd.DataFrame(predictions).T
summary = summary.set_index(train_data.index)
summary['phones_cnt'] = train_data['phones_cnt']
summary['price'] = train_data['price']
summary['emails_cnt'] = train_data['emails_cnt']
summary['urls_cnt'] = train_data['urls_cnt']
summary['close_hours'] = train_data['close_hours']
summary['category'] = train_data['category']
summary['subcategory'] = train_data['subcategory']
y_train = train_data['is_blocked']
# 6. Tune the covering model (model_2) on the meta-feature matrix
forest = RandomForestClassifier(n_estimators=10, random_state=SEED)
param_grid = {"max_depth": randint(low=1, high=15),
# "max_features": ['sqrt', 'log2'],
# "min_samples_leaf": [4, 8, 16, 32, 64, 128],
# "bootstrap": [True, False],
"criterion": ["gini", "entropy"],
}
grid_search = RandomizedSearchCV(forest, param_distributions=param_grid,
n_iter=15, cv=5, scoring='neg_mean_squared_error', random_state=SEED, verbose = 1)
grid_search.fit(summary, y_train)
forest_params = grid_search.best_params_
# 7. Fit the covering model and predict on the meta-features
clf = RandomForestClassifier(**forest_params, random_state=SEED)
clf.fit(summary, y_train)
pasta = summary
pasta['is_blocked'] = clf.predict(summary)
In [27]:
# summary = summary.drop('is_blocked', axis=1)
clf.score(summary, y_train)
Out[27]:
In [61]:
# Full prediction pass on a second slice of the raw training data.
# Note: rows 10000:20000 lie inside the [:50000] slice used for training above,
# so this score is effectively a training-set score and will be optimistic.
train_data_2 = train_data_raw[10000:20000].copy()
train_data_2[CAT_COLS] = train_data_2[CAT_COLS].apply(lambda x: pd.factorize(x)[0])
sparse_frames_train_2 = preprocessFeatures(train_data_2, vectorizers)[0]
predictions_2 = []
for i in range(len(models)):
    model = models[i]
    X = pd.DataFrame(sparse_frames_train_2[i].toarray())
    y = model.predict(X)
    predictions_2.append(y)
summary_2 = pd.DataFrame(predictions_2).T
summary_2 = summary_2.set_index(train_data_2.index)
summary_2['phones_cnt'] = train_data_2['phones_cnt']
summary_2['price'] = train_data_2['price']
summary_2['emails_cnt'] = train_data_2['emails_cnt']
summary_2['urls_cnt'] = train_data_2['urls_cnt']
summary_2['close_hours'] = train_data_2['close_hours']
summary_2['category'] = train_data_2['category']
summary_2['subcategory'] = train_data_2['subcategory']
y_train_2 = train_data_2['is_blocked']
clf.score(summary_2, y_train_2)
Out[61]:
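In [ ]:
# A minimal sketch (assumes summary and y_train from the step-by-step cell are
# still in scope): cross-validated accuracy gives a less optimistic estimate
# than scoring on the training rows.
from sklearn.model_selection import cross_val_score
features = summary.drop('is_blocked', axis=1, errors='ignore')
cross_val_score(clf, features, y_train, cv=3).mean()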
In [47]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import SGDClassifier
temp = pd.DataFrame(sparse_frames_train[0].toarray())
In [40]:
params = list()
from scipy.stats import randint
forest = RandomForestClassifier(n_estimators=10, random_state=SEED)
param_grid = {"max_depth": randint(low=1, high=15),
              "max_features": ['sqrt', 'log2'],
              "min_samples_leaf": [4, 8, 16, 32, 64, 128],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}
i = 0
for fr in sparse_frames_train[0:2]:
    print("##### Started: ", TARGET_COLUMNS[i])
    i += 1
    X = pd.DataFrame(fr.toarray())
    y = train_data.is_blocked
    grid_search = RandomizedSearchCV(forest, param_distributions=param_grid,
                                     n_iter=15, cv=5, scoring='neg_mean_squared_error',
                                     random_state=45426)
    grid_search.fit(X, y)
    params.append(grid_search.best_params_)
In [41]:
params
Out[41]:
In [61]:
sgd_clf = SGDClassifier(random_state=42)
predictions = []
for fr in sparse_frames_train[0:1]:
    X = pd.DataFrame(fr.toarray())
    y = train_data.is_blocked
    sgd_clf.fit(X, y)
predictions
In [50]:
scores
Out[50]:
In [ ]:
In [42]:
clf = RandomForestClassifier(n_estimators=10)
# predict_proba needs a fitted model and a feature matrix, e.g.:
# clf.fit(summary, y_train).predict_proba(summary)
# scores = cross_val_score(clf, X_train, y_train, cv=5)
# scores
In [ ]:
target.value_counts()
# 9322 / 10000 = 0.9322
In [ ]:
data_test = pd.read_csv('data/avito_test.tsv', sep='\t', index_col=0)
data_test.head(1)
In [ ]:
import sys
sys.path.append('./scripts')
import preprocessing
In [ ]:
df_title = pd.DataFrame(preprocessing.frame(train,"title").toarray())
df_desc = pd.DataFrame(preprocessing.frame(train,"description").toarray())
df_attrs = pd.DataFrame(preprocessing.frame(train,"attrs").toarray())