In [1]:
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from nltk.stem.snowball import RussianStemmer
import string

from sklearn.feature_extraction.text import CountVectorizer

In [2]:
from scipy.stats import randint

MAX_FEATURES = 5000 # maximum number of word features for CountVectorizer
CAT_COLS = ['category', 'subcategory'] # columns to factorize
TARGET_COLUMNS = ['title', 'description', 'attrs', ['title', 'description']] # columns to build BagOfWords tables from
SEED = 8451 # random seed
MODEL_COLUMNS = ['price', 'phones_cnt', 'emails_cnt', 'urls_cnt', 'category', 'subcategory']
FOREST_TRAIN_PARAMETERS = {"max_depth": randint(low=1, high=15),
                           "max_features": ['sqrt', 'log2'],
                           "min_samples_leaf": [4, 8, 16, 32],
                           "bootstrap": [True, False],
                           "criterion": ["gini", "entropy"],
                          }
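
For reference, randint(low=1, high=15) is a frozen scipy distribution; RandomizedSearchCV draws max_depth candidates from it via .rvs(). A quick illustration (not part of the pipeline):

In [ ]:
# Sample five max_depth candidates the way RandomizedSearchCV would
randint(low=1, high=15).rvs(size=5, random_state=SEED)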

In [3]:
train_data_raw = pd.read_csv('data/avito_train.tsv', sep='\t', index_col=0)
test_data_raw = pd.read_csv('data/avito_test.tsv', sep='\t', index_col=0)

In [4]:
train_data_raw.shape


Out[4]:
(3995803, 12)

In [5]:
train_data_raw.head(1)


Out[5]:
category subcategory title description attrs price is_proved is_blocked phones_cnt emails_cnt urls_cnt close_hours
itemid
10000010 Транспорт Автомобили с пробегом Toyota Sera, 1991 Новая оригинальная линзованая оптика на ксенон... {"Год выпуска":"1991", "Тип кузова":"Купе", "П... 150000 NaN 0 0 0 0 0.03

In [7]:
train_data = train_data_raw[:50000]
test_data = test_data_raw[:10000]

# Inspect the columns: name, dtype, number of unique values
for column in train_data.columns:
    print("{: <20} {:} {: >10}".format(column, train_data[column].dtype, len(train_data[column].unique())))

# From the table we identify the categorical string columns
cat_cols = ['category', 'subcategory']

import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

enc = LabelEncoder()
train_data[cat_cols] = train_data[cat_cols].apply(lambda x: pd.factorize(x)[0])


In [5]:
# %load scripts/preprocessing.py
from nltk.stem.snowball import RussianStemmer
from nltk.corpus import stopwords
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from scipy.stats import randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV


# stemming, punctuation and stopwords
stemmer = RussianStemmer()
exclude = string.punctuation + string.digits
stopwords = set(stopwords.words("russian"))

vectorizer = None

# Convert a string into a list of lowercased, stemmed words
def clear(text):
    # 0. Guard against NaN values in the data
    text = str(text)
    if text == "nan":
        return []
    # 1. Drop everything that is not a Latin or Cyrillic letter
    temp = re.sub("[^a-zA-Zа-яА-Я]", " ", text)
    # 2. Lowercase and split into words
    temp = temp.lower().split()
    # 3. Stem, drop stopwords and short tokens
    temp = [stemmer.stem(i) for i in temp if i not in stopwords]
    temp = [i for i in temp if len(i) > 2]
    return temp
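
# Illustrative example (assumed output; exact stems depend on the NLTK RussianStemmer):
#   clear("Продаю Новый велосипед, 2015 года!") -> ['прода', 'нов', 'велосипед', 'год']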

def preprocessFeatures(df, vectorizers=None, columns=TARGET_COLUMNS, max_features=MAX_FEATURES):
    data_type = -1  # -1: something is wrong; 0: training data; 1: test data
    if vectorizers is None:
        print("PROCESSING TRAIN DATA")
        vectorizers = dict() # fitted vectorizers (built on the training set only)
        data_type = 0
    else:
        print("PROCESSING TEST DATA")
        data_type = 1
    features = [] # preprocessing results
    for column in columns:
        print("COLUMN: {0}".format(column))
        # 1. Clean the data and join each document back into a single string
        cleared = [] # list of cleaned, transformed strings
        if type(column) is str: # a single column
            cleared = [" ".join(clear(i)) for i in df[column]]
        else: # a pair of columns combined into one document
            temp = [series_.values for id_, series_ in df[column].iterrows()]
            temp = [" ".join(clear(str(i) + " " + str(j))) for i, j in temp]
            cleared = cleared + temp
        print("  - Cleared")
        # 2. For test data the column's vectorizer already exists, so reuse it; for training data, create one.
        if data_type == 0:
            vect = CountVectorizer(analyzer="word",
                                   tokenizer=None,
                                   preprocessor=None,
                                   stop_words=None,
                                   max_features=max_features)
            # 3a. Learn the vocabulary and transform
            features.append(vect.fit_transform(cleared)) # fit + transform
            vectorizers["".join(column)] = vect # remember the fitted vectorizer
        else:
            # 3b. Transform only, using the stored vectorizer
            features.append(vectorizers["".join(column)].transform(cleared))
        print("  - Processed\n")
    return features, vectorizers

# Run each BagOfWords matrix through its matching model and collect the predictions
def modelsPredicts(frames, models):
    predictions = []
    for i in range(len(models)):
        model = models[i]
        X = pd.DataFrame(frames[i].toarray())
        y = model.predict(X)
        predictions.append(y)
    return predictions

# Append the remaining (unused) feature columns and factorize the categorical ones
def concatenateRemaining(df, predictions, model_columns=MODEL_COLUMNS, cat_cols=CAT_COLS):
    # 1. Factorize categorical columns
    df[cat_cols] = df[cat_cols].apply(lambda x: pd.factorize(x)[0])
    X = pd.DataFrame(predictions).T
    X = X.set_index(df.index)
    for column in model_columns:
        X[column] = df[column]
    return X

# Find the best parameters for the covering model (over the remaining features and the local predictions)
def getCoveringModelParams(X, y,
                           parameters=FOREST_TRAIN_PARAMETERS,
                           seed=SEED,
                           model=RandomForestClassifier(random_state=SEED),
                          ):
    grid_search = RandomizedSearchCV(model,
                                     param_distributions=parameters,
                                     n_iter=15, cv=5,
                                     scoring='neg_mean_squared_error',
                                     random_state=SEED,
                                     verbose=1)
    grid_search.fit(X, y)
    return grid_search.best_params_

# Train the local models, the covering model, and the vectorizers
def trainModel(df, cat_cols=CAT_COLS, max_features=MAX_FEATURES, target_columns=TARGET_COLUMNS, seed=SEED):
    # 1. Build the BagOfWords matrices
    sparse_frames, vectorizers = preprocessFeatures(df)
    # 2. Fit a local model (model_1[]) on each matrix
    models = []
    print("FITTING LOCAL MODELS")
    i = 1
    for fr in sparse_frames:
        sgd_clf = SGDClassifier(random_state=seed, n_jobs=-1)
#         X = pd.DataFrame(fr.toarray())
        X = fr
        y = df.is_blocked
        sgd_clf.fit(X, y)
        models.append(sgd_clf)
        print("  -fitted {0} from {1}".format(i, len(sparse_frames)))
        i = i + 1
    print()
    # 3. Get each local model's predictions for its matrix
    predictions = modelsPredicts(sparse_frames, models)
    # 4. Build the summary matrix and append the categorical + extra feature columns
    X = concatenateRemaining(df, predictions)
    y = df['is_blocked']
    # 5. Fit the covering model (model_2) on the summary matrix
    best_params = getCoveringModelParams(X, y)
    covering_model = RandomForestClassifier(**best_params, n_jobs=-1).fit(X, y)
    return vectorizers, models, covering_model
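
A minimal smoke test of preprocessFeatures on a toy frame (hypothetical data, only to show the returned shapes and how the fitted vectorizers are reused on test data):

In [ ]:
toy = pd.DataFrame({'title': ['Продам новый велосипед', 'Куплю старый гараж']})
toy_mats, toy_vects = preprocessFeatures(toy, columns=['title'], max_features=10)
print(toy_mats[0].shape)  # (2, n_terms) sparse bag-of-words matrix
print(preprocessFeatures(toy, toy_vects, columns=['title'])[0][0].shape)  # same vocabulary reused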

In [11]:
###
### Full data processing pipeline
###
from sklearn.linear_model import SGDClassifier
from scipy.stats import randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# 1. Train the model
vectorizers, models, covering_model = trainModel(train_data)

# 2. Preprocess the test data
pre_data = preprocessFeatures(test_data, vectorizers)[0]

# 3. Local models' predictions
priv_predictions = modelsPredicts(pre_data, models)

# 4. Build the summary matrix
X = concatenateRemaining(test_data, priv_predictions)

# 5. Covering model's prediction
test_predictions = covering_model.predict(X)
pasta = X
pasta['is_blocked'] = test_predictions


PROCESSING TRAIN DATA
COLUMN: title
  - Cleared
  - Processed

COLUMN: description
  - Cleared
  - Processed

COLUMN: attrs
  - Cleared
  - Processed

COLUMN: ['title', 'description']
  - Cleared
  - Processed

FITTING LOCAL MODELS
  -fitted 1 from 4
  -fitted 2 from 4
  -fitted 3 from 4
  -fitted 4 from 4

Fitting 5 folds for each of 15 candidates, totalling 75 fits
[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:    4.2s finished
PROCESSING TEST DATA
COLUMN: title
  - Cleared
  - Processed

COLUMN: description
  - Cleared
  - Processed

COLUMN: attrs
  - Cleared
  - Processed

COLUMN: ['title', 'description']
  - Cleared
  - Processed


In [13]:
X = X.drop('is_blocked', axis=1)
covering_model.score(X, y)
# pasta


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-13-db20e93e391a> in <module>()
      1 X = X.drop('is_blocked', 1)
----> 2 covering_model.score(X, y)
      3 # pasta

NameError: name 'y' is not defined
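
The NameError is expected here: avito_test.tsv carries no is_blocked labels, so there is no y to score against in this cell. A minimal sketch of a fix, assuming we score on a labelled hold-out slice of the training file instead (the same idea as In [61] below; the slice bounds are arbitrary):

In [ ]:
# Score the covering model on a labelled hold-out slice of the training data
holdout = train_data_raw[50000:60000].copy()
frames_h = preprocessFeatures(holdout, vectorizers)[0]
preds_h = modelsPredicts(frames_h, models)
X_h = concatenateRemaining(holdout, preds_h)
covering_model.score(X_h, holdout['is_blocked'])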

In [ ]:
###
### Full data processing pipeline
###
from sklearn.linear_model import SGDClassifier
from scipy.stats import randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

train = pd.read_csv('data/avito_train.tsv', sep='\t', index_col=0)
test = pd.read_csv('data/avito_test.tsv', sep='\t', index_col=0)

In [6]:
train = train[:50000]

# 1. Train the model
vectorizers, models, covering_model = trainModel(train)

# 2. Preprocess the test data
pre_data = preprocessFeatures(test, vectorizers)[0]

# 3. Local models' predictions
priv_predictions = modelsPredicts(pre_data, models)

# 4. Build the summary matrix
X = concatenateRemaining(test, priv_predictions)

# 5. Covering model's prediction
test_predictions = covering_model.predict(X)
# pasta = X
# pasta['is_blocked'] = test_predictions


PROCESSING TRAIN DATA
COLUMN: title
  - Cleared
  - Processed

COLUMN: description
  - Cleared
  - Processed

COLUMN: attrs
  - Cleared
  - Processed

COLUMN: ['title', 'description']
  - Cleared
  - Processed

FITTING LOCAL MODELS
  -fitted 1 from 4
  -fitted 2 from 4
  -fitted 3 from 4
  -fitted 4 from 4

Fitting 5 folds for each of 15 candidates, totalling 75 fits
[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:   15.8s finished
PROCESSING TEST DATA
COLUMN: title
  - Cleared
  - Processed

COLUMN: description
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-6-c29e2228b57c> in <module>()
      5 
      6 # 2. Обработка данных
----> 7 pre_data = preprocessFeatures(test, vectorizers)[0]
      8 
      9 # 3. Предсказания частных моделей

<ipython-input-5-5bf4fa76a467> in preprocessFeatures(df, vectorizers, columns, max_features)
     47         cleared = [] # список очищенных и преобразованных строк
     48         if type(column) is str: # обработка одной колонки
---> 49             cleared = [" ".join(clear(i)) for i in df[column]]
     50         else: # обработка 2 колонок
     51             temp = [series_.values for id_, series_ in df[column].iterrows()]

<ipython-input-5-5bf4fa76a467> in <listcomp>(.0)
     47         cleared = [] # список очищенных и преобразованных строк
     48         if type(column) is str: # обработка одной колонки
---> 49             cleared = [" ".join(clear(i)) for i in df[column]]
     50         else: # обработка 2 колонок
     51             temp = [series_.values for id_, series_ in df[column].iterrows()]

<ipython-input-5-5bf4fa76a467> in clear(text)
     28     temp = temp.lower().split()
     29     # 3. Стемминг и уборка стоп-слов
---> 30     temp = [stemmer.stem(i) for i in temp if i not in stopwords]
     31     temp = [i for i in temp if len(i) > 2]
     32     return temp

<ipython-input-5-5bf4fa76a467> in <listcomp>(.0)
     28     temp = temp.lower().split()
     29     # 3. Стемминг и уборка стоп-слов
---> 30     temp = [stemmer.stem(i) for i in temp if i not in stopwords]
     31     temp = [i for i in temp if len(i) > 2]
     32     return temp

/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/nltk/stem/snowball.py in stem(self, word)
   3050 
   3051             for suffix in self.__adjectival_suffixes:
-> 3052                 if rv.endswith(suffix):
   3053                     if suffix in ('i^ushchi^ui^u', 'i^ushchi^ai^a',
   3054                               'i^ushchui^u', 'i^ushchai^a', 'i^ushchoi^u',

KeyboardInterrupt: 

In [52]:
###
### Full data processing pipeline
###
# from sklearn.linear_model import SGDClassifier
# from scipy.stats import randint
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import RandomizedSearchCV

# 1. Factorize categorical columns
train_data[CAT_COLS] = train_data[CAT_COLS].apply(lambda x: pd.factorize(x)[0])
test_data[CAT_COLS] = test_data[CAT_COLS].apply(lambda x: pd.factorize(x)[0])

# 2. Build the BagOfWords matrices
# temp = [clearColumn(train_data, test_data, i) for i in TARGET_COLUMNS]
sparse_frames_train, vectorizers = preprocessFeatures(train_data)
sparse_frames_test = preprocessFeatures(test_data, vectorizers)[0]

# 3. Fit a local model (model_1[]) on each matrix
models = []
for fr in sparse_frames_train:
    sgd_clf = SGDClassifier(random_state=SEED)
    X = pd.DataFrame(fr.toarray())
    y = train_data.is_blocked
    sgd_clf.fit(X, y)
    models.append(sgd_clf)

# 4. Get each local model's predictions for its matrix
predictions = []
for i in range(len(models)):
    model = models[i]
    X = pd.DataFrame(sparse_frames_train[i].toarray())
    y = model.predict(X)
    predictions.append(y)

# 5. Build the summary matrix and append the categorical + extra feature columns
summary = pd.DataFrame(predictions).T
summary = summary.set_index(train_data.index)
summary['phones_cnt'] = train_data['phones_cnt']
summary['price'] = train_data['price']
summary['emails_cnt'] = train_data['emails_cnt']
summary['urls_cnt'] = train_data['urls_cnt']
summary['close_hours'] = train_data['close_hours']
summary['category'] = train_data['category']
summary['subcategory'] = train_data['subcategory']
y_train = train_data['is_blocked']

# 6. Fit the covering model (model_2) on the summary matrix
forest = RandomForestClassifier(n_estimators=10, random_state=SEED)
param_grid = {"max_depth": randint(low=1, high=15),
#               "max_features": ['sqrt', 'log2'],
#               "min_samples_leaf": [4, 8, 16, 32, 64, 128],
#               "bootstrap": [True, False],
              "criterion": ["gini", "entropy"],
             }

grid_search = RandomizedSearchCV(forest, param_distributions=param_grid, 
                                 n_iter=15, cv=5, scoring='neg_mean_squared_error', random_state=SEED, verbose = 1)
grid_search.fit(summary, y_train)
forest_params = grid_search.best_params_

# 7. Predict with the covering model (model_2) on the summary matrix
clf = RandomForestClassifier(**forest_params)
clf.fit(summary, y_train)
pasta = summary
pasta['is_blocked'] = clf.predict(summary)


PROCESSING TRAIN DATA
COLUMN: title
  - Cleared
  - Processed

COLUMN: description
  - Cleared
  - Processed

COLUMN: attrs
  - Cleared
  - Processed

COLUMN: ['title', 'description']
  - Cleared
  - Processed

PROCESSING TEST DATA
COLUMN: title
  - Cleared
  - Processed

COLUMN: description
  - Cleared
  - Processed

COLUMN: attrs
  - Cleared
  - Processed

COLUMN: ['title', 'description']
  - Cleared
  - Processed

Fitting 5 folds for each of 15 candidates, totalling 75 fits
[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:    7.5s finished

In [27]:
# summary = summary.drop('is_blocked', 1)
clf.score(summary, y_train)


Out[27]:
0.99429999999999996

In [61]:
# full prediction pipeline on a held-out slice of the training data
train_data_2 = train_data_raw[10000:20000]

train_data_2[CAT_COLS] = train_data_2[CAT_COLS].apply(lambda x: pd.factorize(x)[0])
sparse_frames_train_2 = preprocessFeatures(train_data_2, vectorizers)[0]
predictions_2 = []
for i in range(len(models)):
    model = models[i]
    X = pd.DataFrame(sparse_frames_train_2[i].toarray())
    y = model.predict(X)
    predictions_2.append(y)

summary_2 = pd.DataFrame(predictions_2).T
summary_2 = summary_2.set_index(train_data_2.index)
summary_2['phones_cnt'] = train_data_2['phones_cnt']
summary_2['price'] = train_data_2['price']
summary_2['emails_cnt'] = train_data_2['emails_cnt']
summary_2['urls_cnt'] = train_data_2['urls_cnt']
summary_2['close_hours'] = train_data_2['close_hours']
summary_2['category'] = train_data_2['category']
summary_2['subcategory'] = train_data_2['subcategory']
y_train_2 = train_data_2['is_blocked']

clf.score(summary_2, y_train_2)


PROCESSING TEST DATA
COLUMN: title
  - Cleared
  - Processed

COLUMN: description
  - Cleared
  - Processed

COLUMN: attrs
  - Cleared
  - Processed

COLUMN: ['title', 'description']
  - Cleared
  - Processed

Out[61]:
0.96330000000000005

In [47]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import SGDClassifier

temp = pd.DataFrame(sparse_frames_train[0].toarray())

In [40]:
params = list()
from scipy.stats import randint

forest=RandomForestClassifier(n_estimators=10, random_state=SEED)
param_grid = {"max_depth": randint(low=1, high=15),
              "max_features": ['sqrt', 'log2'],
              "min_samples_leaf": [4, 8, 16, 32, 64, 128],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}
i = 0
for fr in sparse_frames_train[0: 2]:
    print("##### Started: ", TARGET_COLUMNS[i])
    i += 1
    X = pd.DataFrame(fr.toarray())
    y = train_data.is_blocked
    grid_search = RandomizedSearchCV(forest, param_distributions=param_grid, 
                                     n_iter=15, cv=5, scoring='neg_mean_squared_error', random_state=45426)
    grid_search.fit(X, y)
    params.append(grid_search.best_params_)


##### Started:  title
##### Started:  description

In [41]:
params


Out[41]:
[{'bootstrap': True,
  'criterion': 'entropy',
  'max_depth': 11,
  'max_features': 'sqrt',
  'min_samples_leaf': 128},
 {'bootstrap': True,
  'criterion': 'entropy',
  'max_depth': 9,
  'max_features': 'sqrt',
  'min_samples_leaf': 8}]

In [61]:
sgd_clf = SGDClassifier(random_state=42)
predictions = []

for fr in sparse_frames_train[0: 1]:
    X = pd.DataFrame(fr.toarray())
    y = train_data.is_blocked
    sgd_clf.fit(X, y)
    predictions.append(sgd_clf.predict(X))



In [50]:
scores


Out[50]:
array([ 0.95152424,  0.96051974,  0.959     ,  0.95947974,  0.95997999])
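
The scores above come from a cell that is not shown; presumably they were produced by a cross_val_score call along these lines (a reconstruction based on the commented-out line in In [42], not the original cell):

In [ ]:
scores = cross_val_score(RandomForestClassifier(n_estimators=10, random_state=SEED),
                         temp, train_data.is_blocked, cv=5)
scores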

In [42]:
clf = RandomForestClassifier(n_estimators=10)
clf.fit(summary, y_train)  # the classifier must be fitted before predicting
clf.predict_proba(summary)
# scores = cross_val_score(clf, X_train, y_train, cv=5)
# scores

In [ ]:
target.value_counts()
# 9322/10000


In [ ]:
data_test = pd.read_csv('data/avito_test.tsv', sep='\t', index_col=0)
data_test.head(1)

In [ ]:
import sys
sys.path.append('./scripts')
import preprocessing

In [ ]:
df_title = pd.DataFrame(preprocessing.frame(train,"title").toarray())
df_desc = pd.DataFrame(preprocessing.frame(train,"description").toarray())
df_attrs = pd.DataFrame(preprocessing.frame(train,"attrs").toarray())